SkalskiP committed
Commit
089bf89
1 Parent(s): 76abf0b

initial commit with Florence-2

Files changed (6)
  1. .gitignore +2 -0
  2. app.py +74 -0
  3. requirements-local.txt +9 -0
  4. requirements.txt +8 -0
  5. utils/__init__.py +0 -0
  6. utils/florence.py +55 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ /venv
+ /.idea
app.py ADDED
@@ -0,0 +1,74 @@
+ from typing import Tuple
+
+ import gradio as gr
+ import supervision as sv
+ import torch
+ from PIL import Image
+
+ from utils.florence import load_model, run_inference, FLORENCE_DETAILED_CAPTION_TASK, \
+     FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK
+
+ MARKDOWN = """
+ # Florence-2 + SAM2 🔥
+ """
+
+ DEVICE = torch.device("cpu")
+
+ FLORENCE_MODEL, FLORENCE_PROCESSOR = load_model(device=DEVICE)
+ BOX_ANNOTATOR = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
+ LABEL_ANNOTATOR = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
+
+
+ def process(
+     image_input,
+ ) -> Tuple[Image.Image, str]:
+     _, result = run_inference(
+         model=FLORENCE_MODEL,
+         processor=FLORENCE_PROCESSOR,
+         device=DEVICE,
+         image=image_input,
+         task=FLORENCE_DETAILED_CAPTION_TASK
+     )
+     caption = result[FLORENCE_DETAILED_CAPTION_TASK]
+     _, result = run_inference(
+         model=FLORENCE_MODEL,
+         processor=FLORENCE_PROCESSOR,
+         device=DEVICE,
+         image=image_input,
+         task=FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK,
+         text=caption
+     )
+     detections = sv.Detections.from_lmm(
+         lmm=sv.LMM.FLORENCE_2,
+         result=result,
+         resolution_wh=image_input.size
+     )
+
+     output_image = image_input.copy()
+     output_image = BOX_ANNOTATOR.annotate(output_image, detections)
+     output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
+     return output_image, caption
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(MARKDOWN)
+     with gr.Row():
+         with gr.Column():
+             image_input_component = gr.Image(
+                 type='pil', label='Upload image')
+             submit_button_component = gr.Button(value='Submit', variant='primary')
+
+         with gr.Column():
+             image_output_component = gr.Image(type='pil', label='Image output')
+             text_output_component = gr.Textbox(label='Caption output')
+
+     submit_button_component.click(
+         fn=process,
+         inputs=[image_input_component],
+         outputs=[
+             image_output_component,
+             text_output_component
+         ]
+     )
+
+ demo.launch(debug=False, show_error=True, max_threads=1)
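
DEVICE is pinned to the CPU in this commit, so both Florence-2 passes run without a GPU. A small hypothetical variant of that line (not part of this commit) would pick up a CUDA device when one is available:

```python
# Hypothetical alternative to the hard-coded DEVICE in app.py (not in this commit):
# use the GPU when CUDA is available, otherwise fall back to the CPU.
import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```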
requirements-local.txt ADDED
@@ -0,0 +1,9 @@
+ torch
+ einops
+ spaces
+ timm
+ transformers
+ samv2
+ gradio
+ supervision
+ opencv-python
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ einops
+ spaces
+ timm
+ transformers
+ samv2
+ gradio
+ supervision
+ opencv-python
utils/__init__.py ADDED
File without changes
utils/florence.py ADDED
@@ -0,0 +1,55 @@
+ import os
+ from typing import Union, Any, Tuple, Dict
+ from unittest.mock import patch
+
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from transformers.dynamic_module_utils import get_imports
+
+ FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
+ FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
+ FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
+
+
+ def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+     """Workaround for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+     if not str(filename).endswith("/modeling_florence2.py"):
+         return get_imports(filename)
+     imports = get_imports(filename)
+     imports.remove("flash_attn")
+     return imports
+
+
+ def load_model(
+     device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
+ ) -> Tuple[Any, Any]:
+     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+         model = AutoModelForCausalLM.from_pretrained(
+             checkpoint, trust_remote_code=True).to(device).eval()
+         processor = AutoProcessor.from_pretrained(
+             checkpoint, trust_remote_code=True)
+         return model, processor
+
+
+ def run_inference(
+     model: Any,
+     processor: Any,
+     device: torch.device,
+     image: Image.Image,
+     task: str,
+     text: str = ""
+ ) -> Tuple[str, Dict]:
+     prompt = task + text
+     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3
+     )
+     generated_text = processor.batch_decode(
+         generated_ids, skip_special_tokens=False)[0]
+     response = processor.post_process_generation(
+         generated_text, task=task, image_size=image.size)
+     return generated_text, response
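
For reference, a minimal sketch of driving the helpers in utils/florence.py outside the Gradio app, mirroring the two-stage chain that process() in app.py runs (detailed caption first, then phrase grounding on that caption). The image path is a placeholder, and this assumes the dependencies from requirements-local.txt are installed:

```python
# Sketch: standalone use of utils/florence.py ("example.jpg" is a placeholder path).
import torch
from PIL import Image

from utils.florence import (
    load_model,
    run_inference,
    FLORENCE_DETAILED_CAPTION_TASK,
    FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK,
)

device = torch.device("cpu")
model, processor = load_model(device=device)

image = Image.open("example.jpg").convert("RGB")

# Stage 1: ask Florence-2 for a detailed caption of the whole image.
_, caption_result = run_inference(
    model=model,
    processor=processor,
    device=device,
    image=image,
    task=FLORENCE_DETAILED_CAPTION_TASK,
)
caption = caption_result[FLORENCE_DETAILED_CAPTION_TASK]

# Stage 2: ground the caption's phrases back to bounding boxes.
_, grounding_result = run_inference(
    model=model,
    processor=processor,
    device=device,
    image=image,
    task=FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK,
    text=caption,
)

print(caption)
print(grounding_result[FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK])
```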