from typing import Tuple, Optional

import gradio as gr
import spaces
import supervision as sv
import torch
from PIL import Image
from gradio_image_prompter import ImagePrompter

from utils.annotate import annotate_with_boxes
from utils.models import load_models, run_inference, CHECKPOINTS, \
    pre_process_region_task_input, post_process_region_output
from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
    CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
    IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
    TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES, \
    IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME, \
    DENSE_REGION_CAPTION_TASK_NAME

MARKDOWN = """
# Better Florence-2 Playground 🔥

<div>
<a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-florence-2-on-detection-dataset.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
</a>
<a href="https://blog.roboflow.com/florence-2/">
<img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
</a>
<a href="https://arxiv.org/abs/2311.06242">
<img src="https://img.shields.io/badge/arXiv-2311.06242-b31b1b.svg" alt="arXiv" style="display:inline-block;">
</a>
<a href="https://www.youtube.com/watch?v=i3KjYgxNH6w">
<img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
</a>
</div>

Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
across tasks such as captioning, object detection, grounding, and segmentation.
The model takes images and task prompts as input, generating the desired results in
text format. It uses a DaViT vision encoder to convert images into visual token
embeddings. These are then concatenated with BERT-generated text embeddings and
processed by a transformer-based multi-modal encoder-decoder to generate the response.
"""

EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None],
]
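
# The model description in MARKDOWN above summarizes the raw inference flow that
# utils.models.run_inference is assumed to wrap: a task-token prompt plus an image go
# in, text comes out, and the processor parses that text back into structured output.
# The function below is a minimal, illustrative sketch of that flow with plain
# transformers; it is not called anywhere in this app, and apart from the checkpoint
# name and the standard Florence-2 API calls it is an assumption, not this app's code.
def _florence2_inference_sketch(image: Image.Image, task_prompt: str = "<OD>") -> dict:
    from transformers import AutoModelForCausalLM, AutoProcessor

    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-large-ft", trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-large-ft", trust_remote_code=True)
    # Tokenize the task prompt and encode the image into pixel values.
    inputs = processor(text=task_prompt, images=image, return_tensors="pt")
    # Florence-2 serializes its answer (labels, box locations) as generated text.
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    # post_process_generation parses the text back into a task-keyed dict,
    # e.g. {"<OD>": {"bboxes": [...], "labels": [...]}}.
    return processor.post_process_generation(
        generated_text, task=task_prompt, image_size=image.size)


# The Space is assumed to run on ZeroGPU, so a CUDA device is available whenever a
# spaces.GPU-decorated function executes; models and processors are loaded once at startup.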
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = "cuda"
MODELS, PROCESSORS = load_models(DEVICE)


# ZeroGPU Spaces only attach a GPU while a spaces.GPU-decorated function runs, so the
# inference entry point is decorated here; otherwise the imported `spaces` package
# would go unused and no GPU would be requested for inference.
@spaces.GPU
def process(
    checkpoint_dropdown,
    task_dropdown,
    image_input,
    image_prompter_input
) -> Tuple[Optional[Image.Image], Optional[str]]:
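    """Run the selected Florence-2 checkpoint on the selected task.

    Image-to-image tasks return an annotated image, image-to-text tasks return a
    caption/OCR string, and region-prompted tasks run one inference per drawn box
    and merge the resulting detections.
    """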
    model = MODELS[checkpoint_dropdown]
    processor = PROCESSORS[checkpoint_dropdown]
    task = TASKS[task_dropdown]
    if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None
    elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        return None, response[task]
    elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
        detections_list = []
        image_input = image_prompter_input["image"]
        for prompt in image_prompter_input["points"]:
            text = pre_process_region_task_input(
                prompt=prompt,
                resolution_wh=image_input.size
            )
            _, response = run_inference(
                model, processor, DEVICE, image_input, task, text)
            detections = sv.Detections.from_lmm(
                lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
            detections_list.append(detections)
        detections = sv.Detections.merge(detections_list=detections_list)
        detections = post_process_region_output(
            detections=detections, resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None
    # Fallback so Gradio always receives a value for both output components.
    return None, None
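

# Gradio UI: model and task dropdowns on top, an image (or box-prompt) input plus a
# Submit button on the left, and image/text outputs on the right. Widget visibility
# is switched per task.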
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        checkpoint_dropdown_component = gr.Dropdown(
            choices=CHECKPOINTS,
            value=CHECKPOINTS[0],
            label="Model", info="Select a Florence 2 model to use.",
            interactive=True
        )
        task_dropdown_component = gr.Dropdown(
            choices=TASK_NAMES,
            value=TASK_NAMES[0],
            label="Task", info="Select a task to perform with the model.",
            interactive=True
        )
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(
                type='pil', label='Upload image')
            image_prompter_input_component = ImagePrompter(
                type='pil', label='Image prompt', visible=False)
            submit_button_component = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(label='Caption Output', visible=False)
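
    # Clicking an example row pre-fills the inputs and immediately runs process().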
    with gr.Row():
        gr.Examples(
            fn=process,
            examples=EXAMPLES,
            inputs=[
                checkpoint_dropdown_component,
                task_dropdown_component,
                image_input_component,
                image_prompter_input_component
            ],
            outputs=[
                image_output_component,
                text_output_component
            ],
            run_on_click=True
        )
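
    # Show only the input and output widgets that the newly selected task needs.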
    def on_dropdown_change(text):
        return [
            gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
            ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
            gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
            gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
        ]

    task_dropdown_component.change(
        on_dropdown_change,
        inputs=[task_dropdown_component],
        outputs=[
            image_input_component,
            image_prompter_input_component,
            image_output_component,
            text_output_component
        ]
    )
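
    # Run inference on Submit; process() fills whichever output widget the task uses.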
    submit_button_component.click(
        fn=process,
        inputs=[
            checkpoint_dropdown_component,
            task_dropdown_component,
            image_input_component,
            image_prompter_input_component
        ],
        outputs=[
            image_output_component,
            text_output_component
        ]
    )

demo.launch(debug=False, show_error=True, max_threads=1)