Upload processor
- .gitattributes +1 -0
- added_tokens.json +3 -0
- paligemma_model.py +229 -0
- preprocessor_config.json +43 -0
- processor_config.json +6 -0
- special_tokens_map.json +39 -0
- tokenizer.json +3 -0
- tokenizer.model +3 -0
- tokenizer_config.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json
ADDED
@@ -0,0 +1,3 @@
+{
+  "<image>": 257152
+}
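As a quick sanity check, the added token should resolve to id 257152 once the tokenizer is loaded. A minimal sketch, assuming the files in this commit are checked out locally (the path is hypothetical):

```python
from transformers import AutoTokenizer

# hypothetical local checkout of this repo
tok = AutoTokenizer.from_pretrained("./paligemma-processor")

# added_tokens.json maps the image placeholder to 257152
assert tok.convert_tokens_to_ids("<image>") == 257152
```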
paligemma_model.py
ADDED
@@ -0,0 +1,229 @@
+import re
+from functools import cache
+
+import torch
+from PIL.Image import Image
+from transformers import PaliGemmaConfig as HFPaliGemmaConfig
+from transformers import PaliGemmaForConditionalGeneration as HFPaliGemmaForConditionalGeneration
+from transformers import PaliGemmaProcessor as HFPaliGemmaProcessor
+from transformers.utils import TensorType
+
+from pretrain_mm.processor.processor import ProcessorMixin, TextProcessorMixin
+from pretrain_mm.processor.tokenizer_constants import SetConstants, TokenizerConstants
+from pretrain_mm.utils.token_tag_utils import TagType, box_pattern, point_pattern, segment_str
+
+
+"""
+Note: to patch this model atm, the casting in PaliGemmaForConditionalGeneration._merge_input_ids_with_image_features needs fixing.
+
+I am not sure exactly which devices/dtypes the issue arises from, so I ended up casting multiple things, as otherwise
+debugging/fixing on borah is a pain.
+
+If edited in place on borah, the file lives at
+`/bsuhome/gannett/mambaforge/envs/pt/lib/python3.11/site-packages/transformers/models/paligemma/modeling_paligemma.py`;
+then remove the cast.
+
+Spaces with actual working implementations of seg/detect/etc.:
+
+
+- https://huggingface.co/spaces/big-vision/paligemma-hf
+  - this one has the VAE for decoding to a mask
+- https://huggingface.co/spaces/big-vision/paligemma
+  - this one uses the big-vision code, which is not ideal
+
+
+models:
+https://huggingface.co/google/paligemma-3b-ft-docvqa-896
+https://huggingface.co/google/paligemma-3b-ft-ocrvqa-896
+"""
+
+
+MODEL_ID: str = "google/paligemma-3b-ft-docvqa-896"
+PROCESSOR_IMAGE_MAX_SIZE: int = 1024  # loc/seg values must be scaled to this max
+
+_r_loc = r"<loc(\d{4})>"
+re_loc = re.compile(_r_loc)
+re_loc_point = re.compile(_r_loc * 2)
+re_loc_box = re.compile(_r_loc * 4)
+
+re_seg = re.compile(r"<seg(\d{3})>")
+
+
+def _scale_val(val: int, dim_scale_factor: float, max_size: int = PROCESSOR_IMAGE_MAX_SIZE):
+    return min(int(val * dim_scale_factor), max_size)
+
+
+def _make_seg_text(val: int, tag: str = "loc", digits: int = 4):
+    return f"<{tag}{val:0>{digits}}>"
+
+
+@cache
+def _make_scale_dim_func(image_dim: int, max_size: int = PROCESSOR_IMAGE_MAX_SIZE):
+    # image_dim is either height or width
+    def func(*vals: int):
+        return [round((int(val) / max_size) * image_dim) for val in vals]
+
+    return func
+
+
+class PaliGemmaConfig(HFPaliGemmaConfig):
+    pass
+
+
+class PaliGemmaConstantsClass(TokenizerConstants):
+    # from the processor.tokenizer
+    bos_token: str = "<bos>"
+    eos_token: str = "<eos>"
+    image_placeholder_token: str = "<image>"
+
+    repr_bbox_open_text: str = "<box>"
+    repr_bbox_close_text: str = "</box>"
+    repr_point_open_text: str = "<point>"
+    repr_point_close_text: str = "</point>"
+
+
+PaliGemmaConstants = PaliGemmaConstantsClass()
+
+
+class PaliGemmaForConditionalGeneration(HFPaliGemmaForConditionalGeneration):
+    pass
+
+
+@SetConstants(PaliGemmaConstants)
+class PaliGemmaProcessor(HFPaliGemmaProcessor, ProcessorMixin, TextProcessorMixin):
+    constants: PaliGemmaConstantsClass
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._call = super().__call__  # keep a direct handle to the parent __call__
+
+    def __call__(
+        self,
+        text=None,
+        images=None,
+        tokenize_newline_separately=True,
+        padding=False,
+        truncation=None,
+        max_length=None,
+        return_tensors=TensorType.PYTORCH,
+        do_resize=None,
+        do_normalize=None,
+        image_mean=None,
+        image_std=None,
+        data_format="channels_first",
+        input_data_format=None,
+        resample: "PILImageResampling" = None,  # noqa: F821 # type: ignore
+        do_convert_rgb: bool = None,
+        do_thumbnail: bool = None,
+        do_align_long_axis: bool = None,
+        do_rescale: bool = None,
+        suffix=None,
+        extra: dict | bool = False,
+        **kwargs,
+    ):
+        suffix = suffix or kwargs.get("label", None)
+        if text:
+            text = self.preprocess_text(text, images)
+
+        if suffix:
+            suffix = self.preprocess_text(suffix, images)
+
+        batch = super().__call__(
+            text=text,
+            images=images,
+            tokenize_newline_separately=tokenize_newline_separately,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=return_tensors,
+            do_resize=do_resize,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            resample=resample,
+            do_convert_rgb=do_convert_rgb,
+            do_thumbnail=do_thumbnail,
+            do_align_long_axis=do_align_long_axis,
+            do_rescale=do_rescale,
+            suffix=suffix,
+        )
+
+        batch = self.create_attachable(batch, extra)(text=text, images=images, label=suffix)
+
+        return batch
+
+    def decode(self, outputs: torch.Tensor, do_post: bool = True, **kwargs) -> str:
+        """this is specific to PaliGemma"""
+        # converts the tokens to text
+        outputs = self.tokenizer.decode(outputs, **kwargs)
+        return outputs
+
+    def preprocess_text(
+        self,
+        text: str,
+        images: list[torch.Tensor | Image] | Image = None,
+        max_size: int = PROCESSOR_IMAGE_MAX_SIZE,
+    ) -> str:
+        # not sure what to do for multiple images if we need to scale
+
+        if isinstance(images, list):
+            images = images[0]
+
+        if images is not None:
+            image_width, image_height = images.size
+            height_scale = max_size / image_height
+            width_scale = max_size / image_width
+
+        segments = segment_str(text, box_pattern=box_pattern, point_pattern=point_pattern)
+
+        out_text = ""
+        for seg, seg_type in segments:
+            if seg_type:
+                if seg_type == TagType.POINT:
+                    x, y = map(int, seg)
+                    # scale the coordinates into the 0-1023 loc range
+                    scaled_x = _make_seg_text(_scale_val(x, width_scale, max_size))
+                    scaled_y = _make_seg_text(_scale_val(y, height_scale, max_size))
+                    # the model uses y, x order in its examples
+                    scaled_toks = f"{scaled_y}{scaled_x} point"
+                elif seg_type == TagType.BOX:
+                    x1, y1, x2, y2 = map(int, seg)
+                    # scale the coordinates into the 0-1023 loc range
+                    scaled_x1 = _make_seg_text(_scale_val(x1, width_scale, max_size))
+                    scaled_y1 = _make_seg_text(_scale_val(y1, height_scale, max_size))
+                    scaled_x2 = _make_seg_text(_scale_val(x2, width_scale, max_size))
+                    scaled_y2 = _make_seg_text(_scale_val(y2, height_scale, max_size))
+                    # the model uses y1, x1, y2, x2 order in its examples
+                    scaled_toks = f"{scaled_y1}{scaled_x1}{scaled_y2}{scaled_x2} box"
+                out_text += scaled_toks
+            else:
+                out_text += seg
+        return out_text
+
+    def handle_token_loc_seg(self, text: str, image_height: int, image_width: int):
+        _scale_height = _make_scale_dim_func(image_height)
+        _scale_width = _make_scale_dim_func(image_width)
+        box_tags = ("<box>", "</box>")
+        point_tags = ("<point>", "</point>")
+
+        def _make_text(tag_open: tuple[str, int], tag_close: tuple[str, int], *vals):
+            return (
+                text[: tag_open[1]] + f"{tag_open[0]}{', '.join(map(str, vals))}{tag_close[0]}" + text[tag_close[1] :]
+            )
+
+        def _make_yx(points: list[str]):
+            return _scale_height(*points[0::2]), _scale_width(*points[1::2])
+
+        while loc_match := re_loc_box.search(text):  # search: loc runs may appear anywhere in the text
+            tag_open, tag_close = zip(box_tags, loc_match.span())
+            (y1, y2), (x1, x2) = _make_yx(list(loc_match.groups()))
+            text = _make_text(tag_open, tag_close, y1, x1, y2, x2)
+
+        while loc_match := re_loc_point.search(text):  # remaining loc pairs are points
+            tag_open, tag_close = zip(point_tags, loc_match.span())
+            (y1,), (x1,) = _make_yx(list(loc_match.groups()))
+            text = _make_text(tag_open, tag_close, y1, x1)
+
+        return text
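The `<locXXXX>` scheme above normalizes pixel coordinates to a 0-1024 grid and emits them in y-before-x order. A standalone sketch of that arithmetic, mirroring `_scale_val` and `_make_seg_text` (the 1920×1080 image size is made up for illustration):

```python
MAX_SIZE = 1024  # PROCESSOR_IMAGE_MAX_SIZE

def scale_val(val: int, dim_scale_factor: float, max_size: int = MAX_SIZE) -> int:
    # same scale-and-clamp as _scale_val above
    return min(int(val * dim_scale_factor), max_size)

def make_seg_text(val: int, tag: str = "loc", digits: int = 4) -> str:
    # same zero-padded token format as _make_seg_text above
    return f"<{tag}{val:0>{digits}}>"

# a 1920x1080 image with a point at pixel (x=960, y=540)
width_scale, height_scale = MAX_SIZE / 1920, MAX_SIZE / 1080
x_tok = make_seg_text(scale_val(960, width_scale))   # "<loc0512>"
y_tok = make_seg_text(scale_val(540, height_scale))  # "<loc0512>"
print(f"{y_tok}{x_tok} point")  # y before x, matching preprocess_text
```

`handle_token_loc_seg` runs the inverse: it divides each loc value by 1024, multiplies by the corresponding image dimension, and wraps the results in `<point>`/`<box>` tags.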
preprocessor_config.json
ADDED
@@ -0,0 +1,43 @@
+{
+  "_valid_processor_keys": [
+    "images",
+    "do_resize",
+    "size",
+    "resample",
+    "do_rescale",
+    "rescale_factor",
+    "do_normalize",
+    "image_mean",
+    "image_std",
+    "return_tensors",
+    "data_format",
+    "input_data_format",
+    "do_convert_rgb"
+  ],
+  "auto_map": {
+    "AutoProcessor": "paligemma_model.PaliGemmaProcessor"
+  },
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_seq_length": 4096,
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "PaliGemmaProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 896,
+    "width": 896
+  }
+}
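Two values in this config follow from simple arithmetic: `rescale_factor` is 1/255, and the `image_seq_length` of 4096 comes from the 896×896 input resolution divided into 14×14 patches (the patch size is not stated in this file; it is inferred from the SigLIP backbone used by the 896px PaliGemma checkpoints):

```python
# sanity-check the derived config values (patch_size=14 is an assumption)
size, patch_size = 896, 14
tokens_per_side = size // patch_size  # 64 patches per side
print(tokens_per_side ** 2)           # 4096 == image_seq_length
print(1 / 255)                        # 0.00392156... == rescale_factor
```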
processor_config.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "auto_map": {
+    "AutoProcessor": "paligemma_model.PaliGemmaProcessor"
+  },
+  "processor_class": "PaliGemmaProcessor"
+}
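The `auto_map` entry is what lets `AutoProcessor` resolve the custom `PaliGemmaProcessor` defined in `paligemma_model.py`; custom-code repos like this one must be loaded with `trust_remote_code`. A minimal sketch with a hypothetical repo id:

```python
from transformers import AutoProcessor

# hypothetical repo id; auto_map routes the call to
# paligemma_model.PaliGemmaProcessor from this repo
processor = AutoProcessor.from_pretrained(
    "user/paligemma-processor",
    trust_remote_code=True,
)
```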
special_tokens_map.json
ADDED
@@ -0,0 +1,39 @@
+{
+  "additional_special_tokens": [
+    {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8104de0e0f9b8ab923ac66b31bee4ae132edf05863545fa4a3b69b4774117ae
+size 17763304
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8986bb4f423f07f8c7f70d0dbe3526fb2316056c17bae71b1ea975e77a168fc6
+size 4264023
tokenizer_config.json
ADDED
The diff for this file is too large to render.
See raw diff