Upload preprocessor_config.json
preprocessor_config.json CHANGED (+60 -13)
@@ -1,39 +1,86 @@
 {
-  "auto_map": {
-    "AutoProcessor": "processing_florence2.Florence2Processor"
-  },
   "_valid_processor_keys": [
     "images",
     "do_resize",
     "size",
     "resample",
+    "do_center_crop",
+    "crop_size",
     "do_rescale",
     "rescale_factor",
     "do_normalize",
     "image_mean",
     "image_std",
+    "do_convert_rgb",
     "return_tensors",
     "data_format",
-    "input_data_format"
-    "do_convert_rgb"
+    "input_data_format"
   ],
+  "auto_map": {
+    "AutoProcessor": "processing_florence2.Florence2Processor"
+  },
+  "crop_size": {
+    "height": 768,
+    "width": 768
+  },
+  "do_center_crop": false,
   "do_convert_rgb": null,
   "do_normalize": true,
   "do_rescale": true,
   "do_resize": true,
-  "
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
   "image_processor_type": "CLIPImageProcessor",
   "image_seq_length": 577,
-  "
-
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
   "processor_class": "Florence2Processor",
   "resample": 3,
+  "rescale_factor": 0.00392156862745098,
   "size": {
-    "height": 768,
-    "width":768
-  },
-  "crop_size": {
     "height": 768,
     "width": 768
+  },
+  "tasks_answer_post_processing_type": {
+    "<OCR>": "pure_text",
+    "<OCR_WITH_REGION>": "ocr",
+    "<CAPTION>": "pure_text",
+    "<DETAILED_CAPTION>": "pure_text",
+    "<MORE_DETAILED_CAPTION>": "pure_text",
+    "<OD>": "description_with_bboxes",
+    "<DENSE_REGION_CAPTION>": "description_with_bboxes",
+    "<CAPTION_TO_PHRASE_GROUNDING>": "phrase_grounding",
+    "<REFERRING_EXPRESSION_SEGMENTATION>": "polygons",
+    "<REGION_TO_SEGMENTATION>": "polygons",
+    "<OPEN_VOCABULARY_DETECTION>": "description_with_bboxes_or_polygons",
+    "<REGION_TO_CATEGORY>": "pure_text",
+    "<REGION_TO_DESCRIPTION>": "pure_text",
+    "<REGION_TO_OCR>": "pure_text",
+    "<REGION_PROPOSAL>": "bboxes"
+  },
+  "task_prompts_without_inputs": {
+    "<OCR>": "What is the text in the image?",
+    "<OCR_WITH_REGION>": "What is the text in the image, with regions?",
+    "<CAPTION>": "What does the image describe?",
+    "<DETAILED_CAPTION>": "Describe in detail what is shown in the image.",
+    "<MORE_DETAILED_CAPTION>": "Describe with a paragraph what is shown in the image.",
+    "<OD>": "Locate the objects with category name in the image.",
+    "<DENSE_REGION_CAPTION>": "Locate the objects in the image, with their descriptions.",
+    "<REGION_PROPOSAL>": "Locate the region proposals in the image."
+  },
+  "task_prompts_with_input": {
+    "<CAPTION_TO_PHRASE_GROUNDING>": "Locate the phrases in the caption: {input}",
+    "<REFERRING_EXPRESSION_SEGMENTATION>": "Locate {input} in the image with mask",
+    "<REGION_TO_SEGMENTATION>": "What is the polygon mask of region {input}",
+    "<OPEN_VOCABULARY_DETECTION>": "Locate {input} in the image.",
+    "<REGION_TO_CATEGORY>": "What is the region {input}?",
+    "<REGION_TO_DESCRIPTION>": "What does the region {input} describe?",
+    "<REGION_TO_OCR>": "What text is in the region {input}?"
   }
-}
+}
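For context, a minimal sketch of how these fields are consumed once the file sits in a Florence-2 repo; the checkpoint id and image path below are placeholders, not part of this commit. The "auto_map" entry routes AutoProcessor to the custom Florence2Processor in processing_florence2.py, which is why trust_remote_code=True is needed:

# Sketch only: repo id and image path are placeholder assumptions.
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base",  # assumption: any checkpoint carrying this config
    trust_remote_code=True,       # required because auto_map points at custom code
)

image = Image.open("example.jpg")  # placeholder input image

# A bare task token such as "<OD>" is expanded through
# task_prompts_without_inputs into "Locate the objects with category name
# in the image."; the image is resized to 768x768 ("size"), rescaled by
# 1/255 ("rescale_factor"), and normalized with image_mean/image_std.
inputs = processor(text="<OD>", images=image, return_tensors="pt")

# After model.generate(...), tasks_answer_post_processing_type maps "<OD>"
# to "description_with_bboxes", so the raw generated text is parsed into
# boxes plus labels:
# parsed = processor.post_process_generation(
#     generated_text, task="<OD>", image_size=(image.width, image.height)
# )

The two prompt tables added here drive that task-token expansion (with task_prompts_with_input filling the {input} slot from the user's text), while tasks_answer_post_processing_type tells post_process_generation how to decode each task's raw output.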