from typing import Tuple, Optional
import gradio as gr
import spaces
import supervision as sv
import torch
from PIL import Image
from gradio_image_prompter import ImagePrompter
from utils.annotate import annotate_with_boxes
from utils.models import (
    load_models,
    run_inference,
    CHECKPOINTS,
    pre_process_region_task_input,
    post_process_region_output,
)
from utils.tasks import (
    TASK_NAMES,
    TASKS,
    OBJECT_DETECTION_TASK_NAME,
    CAPTION_TASK_NAME,
    DETAILED_CAPTION_TASK_NAME,
    MORE_DETAILED_CAPTION_TASK_NAME,
    OCR_WITH_REGION_TASK_NAME,
    OCR_TASK_NAME,
    IMAGE_INPUT_TASK_NAMES,
    IMAGE_PROMPTER_INPUT_TASK_NAMES,
    IMAGE_OUTPUT_TASK_NAMES,
    TEXTBOX_OUTPUT_TASK_NAMES,
    IMAGE_TO_IMAGE_TASK_NAMES,
    IMAGE_TO_TEXT_TASK_NAMES,
    IMAGE_PROMPT_TO_IMAGE_TASK_NAMES,
    REGION_PROPOSAL_TASK_NAME,
    DENSE_REGION_CAPTION_TASK_NAME,
)

MARKDOWN = """
# Better Florence-2 Playground 🔥
<div>
<a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-florence-2-on-detection-dataset.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
</a>
<a href="https://blog.roboflow.com/florence-2/">
<img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
</a>
<a href="https://arxiv.org/abs/2311.06242">
<img src="https://img.shields.io/badge/arXiv-2311.06242-b31b1b.svg" alt="arXiv" style="display:inline-block;">
</a>
<a href="https://www.youtube.com/watch?v=i3KjYgxNH6w">
<img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
</a>
</div>
Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
across tasks such as captioning, object detection, grounding, and segmentation.
The model takes images and task prompts as input, generating the desired results in
text format. It uses a DaViT vision encoder to convert images into visual token
embeddings. These are then concatenated with BERT-generated text embeddings and
processed by a transformer-based multi-modal encoder-decoder to generate the response.
"""
EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None],
]

# The Space runs on GPU hardware (see the @spaces.GPU decorator below), so a
# CUDA device is assumed rather than falling back to CPU.
DEVICE = "cuda"
MODELS, PROCESSORS = load_models(DEVICE)
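

# Models and processors are loaded once at startup; the @spaces.GPU decorator
# below requests GPU time for the duration of each call.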
@spaces.GPU
def process(
    checkpoint_dropdown: str,
    task_dropdown: str,
    image_input: Optional[Image.Image],
    image_prompter_input: Optional[dict]
) -> Tuple[Optional[Image.Image], Optional[str]]:
    model = MODELS[checkpoint_dropdown]
    processor = PROCESSORS[checkpoint_dropdown]
    task = TASKS[task_dropdown]
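    # Route by task family: annotated-image output, plain-text output, or
    # region prompts drawn on the image.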
    if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2, result=response,
            resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None
    elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        return None, response[task]
    elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
        detections_list = []
        image_input = image_prompter_input["image"]
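        # Each box drawn in the ImagePrompter becomes its own region prompt;
        # the per-prompt detections are merged into a single result below.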
        for prompt in image_prompter_input["points"]:
            text = pre_process_region_task_input(
                prompt=prompt,
                resolution_wh=image_input.size
            )
            _, response = run_inference(
                model, processor, DEVICE, image_input, task, text)
            detections = sv.Detections.from_lmm(
                lmm=sv.LMM.FLORENCE_2, result=response,
                resolution_wh=image_input.size)
            detections_list.append(detections)
        detections = sv.Detections.merge(detections_list=detections_list)
        detections = post_process_region_output(
            detections=detections, resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None
    # Defensive fallback for any task that matches none of the groups above.
    return None, None


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        checkpoint_dropdown_component = gr.Dropdown(
            choices=CHECKPOINTS,
            value=CHECKPOINTS[0],
            label="Model", info="Select a Florence-2 model to use.",
            interactive=True
        )
        task_dropdown_component = gr.Dropdown(
            choices=TASK_NAMES,
            value=TASK_NAMES[0],
            label="Task", info="Select a task to perform with the model.",
            interactive=True
        )
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(
                type='pil', label='Upload image')
            image_prompter_input_component = ImagePrompter(
                type='pil', label='Image prompt', visible=False)
            submit_button_component = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(label='Caption Output', visible=False)
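    # Clicking an example row runs `process` immediately (run_on_click=True).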
    with gr.Row():
        gr.Examples(
            fn=process,
            examples=EXAMPLES,
            inputs=[
                checkpoint_dropdown_component,
                task_dropdown_component,
                image_input_component,
                image_prompter_input_component
            ],
            outputs=[
                image_output_component,
                text_output_component
            ],
            run_on_click=True
        )
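
    # Show or hide the input/output components so they match what the
    # selected task consumes and produces.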
    def on_dropdown_change(text):
        return [
            gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
            ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
            gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
            gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
        ]
    task_dropdown_component.change(
        on_dropdown_change,
        inputs=[task_dropdown_component],
        outputs=[
            image_input_component,
            image_prompter_input_component,
            image_output_component,
            text_output_component
        ]
    )
    submit_button_component.click(
        fn=process,
        inputs=[
            checkpoint_dropdown_component,
            task_dropdown_component,
            image_input_component,
            image_prompter_input_component
        ],
        outputs=[
            image_output_component,
            text_output_component
        ]
    )
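
# show_error surfaces exceptions in the UI; max_threads=1 limits concurrent
# requests so the single loaded model is not shared across threads.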
demo.launch(debug=False, show_error=True, max_threads=1)