Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- README.md +53 -0
- benchmark.ipynb +0 -0
- requirements.txt +299 -0
- results/layout-benchmark-results-images-1.jpg +0 -0
- results/layout-benchmark-results-images-10.jpg +0 -0
- results/layout-benchmark-results-images-2.jpg +0 -0
- results/layout-benchmark-results-images-3.jpg +0 -0
- results/layout-benchmark-results-images-4.jpg +0 -0
- results/layout-benchmark-results-images-5.jpg +0 -0
- results/layout-benchmark-results-images-6.jpg +0 -0
- results/layout-benchmark-results-images-7.jpg +0 -0
- results/layout-benchmark-results-images-8.jpg +0 -0
- results/layout-benchmark-results-images-9.jpg +0 -0
- surya/__pycache__/detection.cpython-310.pyc +0 -0
- surya/__pycache__/layout.cpython-310.pyc +0 -0
- surya/__pycache__/ocr.cpython-310.pyc +0 -0
- surya/__pycache__/recognition.cpython-310.pyc +0 -0
- surya/__pycache__/schema.cpython-310.pyc +0 -0
- surya/__pycache__/settings.cpython-310.pyc +0 -0
- surya/benchmark/bbox.py +22 -0
- surya/benchmark/metrics.py +139 -0
- surya/benchmark/tesseract.py +179 -0
- surya/benchmark/util.py +31 -0
- surya/detection.py +139 -0
- surya/input/__pycache__/processing.cpython-310.pyc +0 -0
- surya/input/langs.py +19 -0
- surya/input/load.py +74 -0
- surya/input/processing.py +116 -0
- surya/languages.py +101 -0
- surya/layout.py +204 -0
- surya/model/detection/__pycache__/processor.cpython-310.pyc +0 -0
- surya/model/detection/__pycache__/segformer.cpython-310.pyc +0 -0
- surya/model/detection/processor.py +284 -0
- surya/model/detection/segformer.py +468 -0
- surya/model/ordering/config.py +8 -0
- surya/model/ordering/decoder.py +557 -0
- surya/model/ordering/encoder.py +83 -0
- surya/model/ordering/encoderdecoder.py +90 -0
- surya/model/ordering/model.py +34 -0
- surya/model/ordering/processor.py +156 -0
- surya/model/recognition/__pycache__/config.cpython-310.pyc +0 -0
- surya/model/recognition/__pycache__/decoder.cpython-310.pyc +0 -0
- surya/model/recognition/__pycache__/encoder.cpython-310.pyc +0 -0
- surya/model/recognition/__pycache__/model.cpython-310.pyc +0 -0
- surya/model/recognition/__pycache__/processor.cpython-310.pyc +0 -0
- surya/model/recognition/__pycache__/tokenizer.cpython-310.pyc +0 -0
- surya/model/recognition/config.py +111 -0
- surya/model/recognition/decoder.py +511 -0
- surya/model/recognition/encoder.py +469 -0
- surya/model/recognition/model.py +64 -0
README.md
ADDED
@@ -0,0 +1,53 @@
---
license: apache-2.0
---

# Suryolo: Layout Model for Arabic Documents

Suryolo is a combination of the Surya layout model from Surya OCR (based on SegFormer) and YOLOv10 object detection.

## Setup Instructions

### Clone the Surya OCR GitHub Repository

```bash
git clone https://github.com/vikp/surya.git
cd surya
```

### Switch to v0.4.14

```bash
git checkout f7c6c04
```

### Install Dependencies

You can install the required dependencies using the following commands:

```bash
pip install -r requirements.txt
```

```bash
pip install ultralytics
```

```bash
pip install supervision
```

### Suryolo Pipeline

Download the `surya_yolo_pipeline.py` file from the repository.

```python
from PIL import Image

from surya_yolo_pipeline import suryolo
from surya.postprocessing.heatmap import draw_bboxes_on_image

image_path = "sample.jpg"
image = Image.open(image_path)
bboxes = suryolo(image_path)
plotted_image = draw_bboxes_on_image(bboxes, image)
```

#### Refer to `benchmark.ipynb` for a comparison between the traditional Surya layout model and the Suryolo layout model.
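A small, hedged follow-up to the README snippet above (assuming, as the snippet implies, that `suryolo` returns a list of boxes and `draw_bboxes_on_image` returns a PIL image; the output filename is illustrative):

```python
# Continuing from the README snippet: persist the annotated page for inspection
plotted_image.save("sample_layout.jpg")                 # illustrative output path
print(f"{len(bboxes)} layout boxes drawn on {image_path}")
```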
benchmark.ipynb
ADDED
The diff for this file is too large to render.
requirements.txt
ADDED
@@ -0,0 +1,299 @@
```text
absl-py==2.1.0
accelerate==0.34.2
addict==2.4.0
aiofiles==23.2.1
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
albucore==0.0.17
albumentations==1.4.18
altair==5.4.1
annotated-types==0.7.0
antlr4-python3-runtime==4.8
anyio==4.6.0
appdirs==1.4.4
astor==0.8.1
asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1698341106958/work
async-timeout==4.0.3
attrs==24.2.0
av==13.1.0
babel==2.16.0
bce-python-sdk==0.9.23
bcrypt==4.2.0
beartype==0.19.0
beautifulsoup4==4.12.3
bitsandbytes==0.44.1
blinker==1.8.2
boto3==1.35.34
botocore==1.35.34
braceexpand==0.1.7
Brotli @ file:///croot/brotli-split_1714483155106/work
cachetools==5.5.0
certifi @ file:///croot/certifi_1725551672989/work/certifi
cffi==1.17.1
cfgv==3.4.0
charset-normalizer @ file:///croot/charset-normalizer_1721748349566/work
click==8.1.7
colossalai==0.4.0
comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1710320294760/work
contexttimer==0.3.3
contourpy==1.3.0
cpm-kernels==1.0.11
cryptography==43.0.1
cycler==0.12.1
Cython==3.0.11
datasets==3.0.0
debugpy @ file:///croot/debugpy_1690905042057/work
decorator==4.4.2
decord==0.6.0
deepspeed==0.15.1
defusedxml==0.7.1
Deprecated==1.2.14
diffusers==0.30.3
dill==0.3.8
distlib==0.3.8
distro==1.9.0
docker-pycreds==0.4.0
doclayout_yolo==0.0.2
easydict==1.13
einops==0.7.0
entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1643888246732/work
eval_type_backport==0.2.0
exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1720869315914/work
executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1725214404607/work
fabric==3.2.2
faiss-cpu==1.8.0.post1
fastapi==0.110.0
ffmpy==0.4.0
filelock @ file:///croot/filelock_1700591183607/work
fire==0.6.0
flash-attn==2.6.3
Flask==3.0.3
flask-babel==4.0.0
fonttools==4.54.1
frozenlist==1.4.1
fsspec==2024.6.1
ftfy==6.2.3
future==1.0.0
fvcore==0.1.5.post20221221
galore-torch==1.0
gast==0.3.3
gdown==5.1.0
gitdb==4.0.11
GitPython==3.1.43
gmpy2 @ file:///tmp/build/80754af9/gmpy2_1645455533097/work
google==3.0.0
google-auth==2.35.0
google-auth-oauthlib==1.0.0
gradio==4.44.1
gradio_client==1.3.0
grpcio==1.66.1
h11==0.14.0
h5py==3.10.0
hjson==3.1.0
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.25.0
identify==2.6.1
idna==3.6
imageio==2.35.1
imageio-ffmpeg==0.5.1
imgaug==0.4.0
importlib_metadata==8.5.0
importlib_resources==6.4.5
invoke==2.2.0
iopath==0.1.10
ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1719845459717/work
ipython @ file:///home/conda/feedstock_root/build_artifacts/ipython_1725050136642/work
ipywidgets==8.1.5
itsdangerous==2.2.0
jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1696326070614/work
Jinja2 @ file:///croot/jinja2_1716993405101/work
jiter==0.5.0
jmespath==1.0.1
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1654730843242/work
jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1727163409502/work
jupyterlab_widgets==3.0.13
kiwisolver==1.4.7
lazy_loader==0.4
lightning-utilities==0.11.7
lmdb==1.5.1
lxml==5.3.0
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe @ file:///croot/markupsafe_1704205993651/work
matplotlib==3.7.5
matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1713250518406/work
mdurl==0.1.2
mkl-service==2.4.0
mkl_fft @ file:///croot/mkl_fft_1725370245198/work
mkl_random @ file:///croot/mkl_random_1725370241878/work
mmengine==0.10.5
moviepy==1.0.3
mpmath @ file:///croot/mpmath_1690848262763/work
msgpack==1.1.0
multidict==6.1.0
multiprocess==0.70.16
narwhals==1.9.1
nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1705850609492/work
networkx @ file:///croot/networkx_1717597493534/work
ninja==1.11.1.1
nodeenv==1.9.1
numpy==1.26.0
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-ml-py==12.560.30
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.77
nvidia-nvtx-cu12==12.1.105
oauthlib==3.2.2
omegaconf==2.1.1
openai==1.51.0
opencv-contrib-python==4.10.0.84
opencv-python==4.9.0.80
opencv-python-headless==4.9.0.80
opensora @ file:///share/data/drive_3/ketan/t2v/Open-Sora
opt-einsum==3.3.0
orjson==3.10.7
packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1718189413536/work
paddleclas==2.5.2
paddleocr==2.8.1
paddlepaddle==2.6.2
pandarallel==1.6.5
pandas==2.0.3
parameterized==0.9.0
paramiko==3.5.0
parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1712320355065/work
peft==0.13.0
pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1706113125309/work
pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work
Pillow==9.5.0
platformdirs @ file:///home/conda/feedstock_root/build_artifacts/platformdirs_1726613481435/work
plumbum==1.9.0
portalocker==2.10.1
pre_commit==4.0.0
prettytable==3.11.0
proglog==0.1.10
prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1718047967974/work
protobuf==4.25.5
psutil @ file:///opt/conda/conda-bld/psutil_1656431268089/work
ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1609419310487/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
pure_eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1721585709575/work
py-cpuinfo==9.0.0
pyarrow==17.0.0
pyasn1==0.6.1
pyasn1_modules==0.4.1
pyclipper==1.3.0.post5
pycparser==2.22
pycryptodome==3.20.0
pydantic==2.9.2
pydantic-settings==2.5.2
pydantic_core==2.23.4
pydub==0.25.1
Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1714846767233/work
PyNaCl==1.5.0
pyparsing==3.1.4
pypdfium2==4.30.0
PySocks @ file:///home/builder/ci_310/pysocks_1640793678128/work
python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1709299778482/work
python-docx==1.1.2
python-dotenv==1.0.1
python-multipart==0.0.12
pytorch-lightning==2.2.1
pytorchvideo==0.1.5
pytz==2024.2
PyYAML @ file:///croot/pyyaml_1698096049011/work
pyzmq @ file:///croot/pyzmq_1705605076900/work
qudida==0.0.4
RapidFuzz==3.10.0
rarfile==4.2
ray==2.37.0
referencing==0.35.1
regex==2023.12.25
requests==2.32.3
requests-oauthlib==2.0.0
rich==13.9.2
rotary-embedding-torch==0.5.3
rpds-py==0.20.0
rpyc==6.0.0
rsa==4.9
ruff==0.6.9
s3transfer==0.10.2
safetensors==0.4.5
scikit-image==0.24.0
scikit-learn==1.3.2
scikit-video==1.1.11
scipy==1.10.1
seaborn==0.13.2
semantic-version==2.10.0
sentencepiece==0.2.0
sentry-sdk==2.15.0
setproctitle==1.3.3
shapely==2.0.6
shellingham==1.5.4
six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
smmap==5.0.1
sniffio==1.3.1
soupsieve==2.6
spaces==0.30.3
stack-data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1669632077133/work
starlette==0.36.3
supervision==0.23.0
SwissArmyTransformer==0.4.12
sympy @ file:///croot/sympy_1724938189289/work
tabulate==0.9.0
tensorboard==2.14.0
tensorboard-data-server==0.7.2
tensorboardX==2.6.2.2
termcolor==2.4.0
test_tube==0.7.5
thop==0.1.1.post2209072238
threadpoolctl==3.5.0
tifffile==2024.9.20
timm==0.9.16
tokenizers==0.20.0
tomli==2.0.2
tomlkit==0.12.0
torch==2.4.1
torch-lr-finder==0.2.2
torchaudio==2.4.1
torchdiffeq==0.2.3
torchmetrics==1.3.2
torchvision==0.19.1
tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1648827254365/work
tqdm==4.66.5
traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1713535121073/work
transformers==4.45.1
triton==3.0.0
typer==0.12.5
typing_extensions @ file:///croot/typing_extensions_1715268824938/work
tzdata==2024.1
ujson==5.10.0
ultralytics==8.3.1
ultralytics-thop==2.0.8
urllib3==2.2.1
uvicorn==0.29.0
virtualenv==20.26.6
visualdl==2.5.3
wandb==0.18.3
wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1704731205417/work
webdataset==0.2.100
websockets==11.0.3
Werkzeug==3.0.4
widgetsnbextension==4.0.13
wrapt==1.16.0
xxhash==3.5.0
yacs==0.1.8
yapf==0.40.2
yarl==1.11.1
zipp==3.20.2
```
results/layout-benchmark-results-images-1.jpg
ADDED
results/layout-benchmark-results-images-10.jpg
ADDED
results/layout-benchmark-results-images-2.jpg
ADDED
results/layout-benchmark-results-images-3.jpg
ADDED
results/layout-benchmark-results-images-4.jpg
ADDED
results/layout-benchmark-results-images-5.jpg
ADDED
results/layout-benchmark-results-images-6.jpg
ADDED
results/layout-benchmark-results-images-7.jpg
ADDED
results/layout-benchmark-results-images-8.jpg
ADDED
results/layout-benchmark-results-images-9.jpg
ADDED
surya/__pycache__/detection.cpython-310.pyc
ADDED
Binary file (5.06 kB).
surya/__pycache__/layout.cpython-310.pyc
ADDED
Binary file (6.35 kB).
surya/__pycache__/ocr.cpython-310.pyc
ADDED
Binary file (2.79 kB).
surya/__pycache__/recognition.cpython-310.pyc
ADDED
Binary file (5.86 kB).
surya/__pycache__/schema.cpython-310.pyc
ADDED
Binary file (6.41 kB).
surya/__pycache__/settings.cpython-310.pyc
ADDED
Binary file (3.77 kB).
surya/benchmark/bbox.py
ADDED
@@ -0,0 +1,22 @@
```python
import fitz as pymupdf
from surya.postprocessing.util import rescale_bbox


def get_pdf_lines(pdf_path, img_sizes):
    doc = pymupdf.open(pdf_path)
    page_lines = []
    for idx, img_size in enumerate(img_sizes):
        page = doc[idx]
        blocks = page.get_text("dict", sort=True, flags=pymupdf.TEXTFLAGS_DICT & ~pymupdf.TEXT_PRESERVE_LIGATURES & ~pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]

        line_boxes = []
        for block_idx, block in enumerate(blocks):
            for l in block["lines"]:
                line_boxes.append(list(l["bbox"]))

        page_box = page.bound()
        pwidth, pheight = page_box[2] - page_box[0], page_box[3] - page_box[1]
        line_boxes = [rescale_bbox(bbox, (pwidth, pheight), img_size) for bbox in line_boxes]
        page_lines.append(line_boxes)

    return page_lines
```
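A minimal usage sketch for `get_pdf_lines` (the PDF path is hypothetical; `img_sizes` must hold one `(width, height)` pair per page, matching the rendered page images the boxes are rescaled to, and the surya checkout must be importable, e.g. run from the repo root):

```python
from surya.benchmark.bbox import get_pdf_lines
from surya.input.load import load_pdf

# Render the PDF pages first, then reuse their sizes for rescaling
images, _ = load_pdf("document.pdf")           # hypothetical input file
img_sizes = [image.size for image in images]   # (width, height) per page

page_lines = get_pdf_lines("document.pdf", img_sizes)
print(len(page_lines[0]), "PDF text-line boxes on page 1")
```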
surya/benchmark/metrics.py
ADDED
@@ -0,0 +1,139 @@
```python
from functools import partial
from itertools import repeat

import numpy as np
from concurrent.futures import ProcessPoolExecutor

def intersection_area(box1, box2):
    x_left = max(box1[0], box2[0])
    y_top = max(box1[1], box2[1])
    x_right = min(box1[2], box2[2])
    y_bottom = min(box1[3], box2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    return (x_right - x_left) * (y_bottom - y_top)


def intersection_pixels(box1, box2):
    x_left = max(box1[0], box2[0])
    y_top = max(box1[1], box2[1])
    x_right = min(box1[2], box2[2])
    y_bottom = min(box1[3], box2[3])

    if x_right < x_left or y_bottom < y_top:
        return set()

    x_left, x_right = int(x_left), int(x_right)
    y_top, y_bottom = int(y_top), int(y_bottom)

    coords = np.meshgrid(np.arange(x_left, x_right), np.arange(y_top, y_bottom))
    pixels = set(zip(coords[0].flat, coords[1].flat))

    return pixels


def calculate_coverage(box, other_boxes, penalize_double=False):
    box_area = (box[2] - box[0]) * (box[3] - box[1])
    if box_area == 0:
        return 0

    # find total coverage of the box
    covered_pixels = set()
    double_coverage = list()
    for other_box in other_boxes:
        ia = intersection_pixels(box, other_box)
        double_coverage.append(list(covered_pixels.intersection(ia)))
        covered_pixels = covered_pixels.union(ia)

    # Penalize double coverage - having multiple bboxes overlapping the same pixels
    double_coverage_penalty = len(double_coverage)
    if not penalize_double:
        double_coverage_penalty = 0
    covered_pixels_count = max(0, len(covered_pixels) - double_coverage_penalty)
    return covered_pixels_count / box_area


def calculate_coverage_fast(box, other_boxes, penalize_double=False):
    box_area = (box[2] - box[0]) * (box[3] - box[1])
    if box_area == 0:
        return 0

    total_intersect = 0
    for other_box in other_boxes:
        total_intersect += intersection_area(box, other_box)

    return min(1, total_intersect / box_area)


def precision_recall(preds, references, threshold=.5, workers=8, penalize_double=True):
    if len(references) == 0:
        return {
            "precision": 1,
            "recall": 1,
        }

    if len(preds) == 0:
        return {
            "precision": 0,
            "recall": 0,
        }

    # If we're not penalizing double coverage, we can use a faster calculation
    coverage_func = calculate_coverage_fast
    if penalize_double:
        coverage_func = calculate_coverage

    with ProcessPoolExecutor(max_workers=workers) as executor:
        precision_func = partial(coverage_func, penalize_double=penalize_double)
        precision_iou = executor.map(precision_func, preds, repeat(references))
        reference_iou = executor.map(coverage_func, references, repeat(preds))

    precision_classes = [1 if i > threshold else 0 for i in precision_iou]
    precision = sum(precision_classes) / len(precision_classes)

    recall_classes = [1 if i > threshold else 0 for i in reference_iou]
    recall = sum(recall_classes) / len(recall_classes)

    return {
        "precision": precision,
        "recall": recall,
    }


def mean_coverage(preds, references):
    coverages = []

    for box1 in references:
        coverage = calculate_coverage(box1, preds)
        coverages.append(coverage)

    for box2 in preds:
        coverage = calculate_coverage(box2, references)
        coverages.append(coverage)

    # Calculate the average coverage over all comparisons
    if len(coverages) == 0:
        return 0
    coverage = sum(coverages) / len(coverages)
    return {"coverage": coverage}


def rank_accuracy(preds, references):
    # Preds and references need to be aligned so each position refers to the same bbox
    pairs = []
    for i, pred in enumerate(preds):
        for j, pred2 in enumerate(preds):
            if i == j:
                continue
            pairs.append((i, j, pred > pred2))

    # Find how many of the prediction rankings are correct
    correct = 0
    for i, ref in enumerate(references):
        for j, ref2 in enumerate(references):
            if (i, j, ref > ref2) in pairs:
                correct += 1

    return correct / len(pairs)
```
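A toy example of how `precision_recall` and `mean_coverage` could be called on hand-made boxes (coordinates are illustrative; the `__main__` guard matters because `precision_recall` spawns worker processes):

```python
from surya.benchmark.metrics import precision_recall, mean_coverage

preds = [[0, 0, 100, 20], [0, 30, 100, 50]]   # predicted boxes as (x1, y1, x2, y2)
refs = [[0, 0, 100, 22], [0, 28, 100, 52]]    # ground-truth boxes

if __name__ == "__main__":
    print(precision_recall(preds, refs, penalize_double=False))
    print(mean_coverage(preds, refs))
```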
surya/benchmark/tesseract.py
ADDED
@@ -0,0 +1,179 @@
```python
from typing import List, Optional

import numpy as np
import pytesseract
from pytesseract import Output
from tqdm import tqdm

from surya.input.processing import slice_bboxes_from_image
from surya.settings import settings
import os
from concurrent.futures import ProcessPoolExecutor
from surya.detection import get_batch_size as get_det_batch_size
from surya.recognition import get_batch_size as get_rec_batch_size
from surya.languages import CODE_TO_LANGUAGE


def surya_lang_to_tesseract(code: str) -> Optional[str]:
    lang_str = CODE_TO_LANGUAGE[code]
    try:
        tess_lang = TESS_LANGUAGE_TO_CODE[lang_str]
    except KeyError:
        return None
    return tess_lang


def tesseract_ocr(img, bboxes, lang: str):
    line_imgs = slice_bboxes_from_image(img, bboxes)
    config = f'--tessdata-dir "{settings.TESSDATA_PREFIX}"'
    lines = []
    for line_img in line_imgs:
        line = pytesseract.image_to_string(line_img, lang=lang, config=config)
        lines.append(line)
    return lines


def tesseract_ocr_parallel(imgs, bboxes, langs: List[str], cpus=None):
    tess_parallel_cores = min(len(imgs), get_rec_batch_size())
    if not cpus:
        cpus = os.cpu_count()
    tess_parallel_cores = min(tess_parallel_cores, cpus)

    # Tesseract uses up to 4 processes per instance
    # Divide by 2 because tesseract doesn't seem to saturate all 4 cores with these small images
    tess_parallel = max(tess_parallel_cores // 2, 1)

    with ProcessPoolExecutor(max_workers=tess_parallel) as executor:
        tess_text = tqdm(executor.map(tesseract_ocr, imgs, bboxes, langs), total=len(imgs), desc="Running tesseract OCR")
        tess_text = list(tess_text)
    return tess_text


def tesseract_bboxes(img):
    arr_img = np.asarray(img, dtype=np.uint8)
    ocr = pytesseract.image_to_data(arr_img, output_type=Output.DICT)

    bboxes = []
    n_boxes = len(ocr['level'])
    for i in range(n_boxes):
        # It is possible to merge by line here with line number, but it gives bad results.
        _, x, y, w, h = ocr['text'][i], ocr['left'][i], ocr['top'][i], ocr['width'][i], ocr['height'][i]
        bbox = (x, y, x + w, y + h)
        bboxes.append(bbox)

    return bboxes


def tesseract_parallel(imgs):
    # Tesseract uses 4 threads per instance
    tess_parallel_cores = min(len(imgs), get_det_batch_size())
    cpus = os.cpu_count()
    tess_parallel_cores = min(tess_parallel_cores, cpus)

    # Tesseract uses 4 threads per instance
    tess_parallel = max(tess_parallel_cores // 4, 1)

    with ProcessPoolExecutor(max_workers=tess_parallel) as executor:
        tess_bboxes = tqdm(executor.map(tesseract_bboxes, imgs), total=len(imgs), desc="Running tesseract bbox detection")
        tess_bboxes = list(tess_bboxes)
    return tess_bboxes


TESS_CODE_TO_LANGUAGE = {
    "afr": "Afrikaans",
    "amh": "Amharic",
    "ara": "Arabic",
    "asm": "Assamese",
    "aze": "Azerbaijani",
    "bel": "Belarusian",
    "ben": "Bengali",
    "bod": "Tibetan",
    "bos": "Bosnian",
    "bre": "Breton",
    "bul": "Bulgarian",
    "cat": "Catalan",
    "ceb": "Cebuano",
    "ces": "Czech",
    "chi_sim": "Chinese",
    "chr": "Cherokee",
    "cym": "Welsh",
    "dan": "Danish",
    "deu": "German",
    "dzo": "Dzongkha",
    "ell": "Greek",
    "eng": "English",
    "epo": "Esperanto",
    "est": "Estonian",
    "eus": "Basque",
    "fas": "Persian",
    "fin": "Finnish",
    "fra": "French",
    "fry": "Western Frisian",
    "guj": "Gujarati",
    "gla": "Scottish Gaelic",
    "gle": "Irish",
    "glg": "Galician",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hrv": "Croatian",
    "hun": "Hungarian",
    "hye": "Armenian",
    "iku": "Inuktitut",
    "ind": "Indonesian",
    "isl": "Icelandic",
    "ita": "Italian",
    "jav": "Javanese",
    "jpn": "Japanese",
    "kan": "Kannada",
    "kat": "Georgian",
    "kaz": "Kazakh",
    "khm": "Khmer",
    "kir": "Kyrgyz",
    "kor": "Korean",
    "lao": "Lao",
    "lat": "Latin",
    "lav": "Latvian",
    "lit": "Lithuanian",
    "mal": "Malayalam",
    "mar": "Marathi",
    "mkd": "Macedonian",
    "mlt": "Maltese",
    "mon": "Mongolian",
    "msa": "Malay",
    "mya": "Burmese",
    "nep": "Nepali",
    "nld": "Dutch",
    "nor": "Norwegian",
    "ori": "Oriya",
    "pan": "Punjabi",
    "pol": "Polish",
    "por": "Portuguese",
    "pus": "Pashto",
    "ron": "Romanian",
    "rus": "Russian",
    "san": "Sanskrit",
    "sin": "Sinhala",
    "slk": "Slovak",
    "slv": "Slovenian",
    "snd": "Sindhi",
    "spa": "Spanish",
    "sqi": "Albanian",
    "srp": "Serbian",
    "swa": "Swahili",
    "swe": "Swedish",
    "syr": "Syriac",
    "tam": "Tamil",
    "tel": "Telugu",
    "tgk": "Tajik",
    "tha": "Thai",
    "tir": "Tigrinya",
    "tur": "Turkish",
    "uig": "Uyghur",
    "ukr": "Ukrainian",
    "urd": "Urdu",
    "uzb": "Uzbek",
    "vie": "Vietnamese",
    "yid": "Yiddish"
}

TESS_LANGUAGE_TO_CODE = {v:k for k,v in TESS_CODE_TO_LANGUAGE.items()}
```
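For example, the Surya-to-Tesseract language mapping above can be exercised on its own (importing this module requires `pytesseract` to be installed, since it is pulled in at import time):

```python
from surya.benchmark.tesseract import surya_lang_to_tesseract

print(surya_lang_to_tesseract("ar"))  # -> "ara"
print(surya_lang_to_tesseract("en"))  # -> "eng"
print(surya_lang_to_tesseract("ha"))  # -> None (Hausa has no entry in the Tesseract table above)
```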
surya/benchmark/util.py
ADDED
@@ -0,0 +1,31 @@
```python
def merge_boxes(box1, box2):
    return (min(box1[0], box2[0]), min(box1[1], box2[1]), max(box1[2], box2[2]), max(box1[3], box2[3]))


def join_lines(bboxes, max_gap=5):
    to_merge = {}
    for i, box1 in bboxes:
        for z, box2 in bboxes[i + 1:]:
            j = i + z + 1
            if box1 == box2:
                continue

            if box1[0] <= box2[0] and box1[2] >= box2[2]:
                if abs(box1[1] - box2[3]) <= max_gap:
                    if i not in to_merge:
                        to_merge[i] = []
                    to_merge[i].append(j)

    merged_boxes = set()
    merged = []
    for i, box in bboxes:
        if i in merged_boxes:
            continue

        if i in to_merge:
            for j in to_merge[i]:
                box = merge_boxes(box, bboxes[j][1])
                merged_boxes.add(j)

        merged.append(box)
    return merged
```
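`merge_boxes` simply takes the union of two `(x1, y1, x2, y2)` boxes; a quick check:

```python
from surya.benchmark.util import merge_boxes

print(merge_boxes((10, 10, 50, 20), (40, 12, 90, 25)))  # -> (10, 10, 90, 25)
```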
surya/detection.py
ADDED
@@ -0,0 +1,139 @@
```python
from typing import List, Tuple

import torch
import numpy as np
from PIL import Image

from surya.model.detection.segformer import SegformerForRegressionMask
from surya.postprocessing.heatmap import get_and_clean_boxes
from surya.postprocessing.affinity import get_vertical_lines
from surya.input.processing import prepare_image_detection, split_image, get_total_splits, convert_if_not_rgb
from surya.schema import TextDetectionResult
from surya.settings import settings
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import torch.nn.functional as F


def get_batch_size():
    batch_size = settings.DETECTOR_BATCH_SIZE
    if batch_size is None:
        batch_size = 6
        if settings.TORCH_DEVICE_MODEL == "cuda":
            batch_size = 24
    return batch_size


def batch_detection(images: List, model: SegformerForRegressionMask, processor, batch_size=None) -> Tuple[List[List[np.ndarray]], List[Tuple[int, int]]]:
    assert all([isinstance(image, Image.Image) for image in images])
    if batch_size is None:
        batch_size = get_batch_size()
    heatmap_count = model.config.num_labels

    images = [image.convert("RGB") for image in images]  # also copies the images

    orig_sizes = [image.size for image in images]
    splits_per_image = [get_total_splits(size, processor) for size in orig_sizes]

    batches = []
    current_batch_size = 0
    current_batch = []
    for i in range(len(images)):
        if current_batch_size + splits_per_image[i] > batch_size:
            if len(current_batch) > 0:
                batches.append(current_batch)
            current_batch = []
            current_batch_size = 0
        current_batch.append(i)
        current_batch_size += splits_per_image[i]

    if len(current_batch) > 0:
        batches.append(current_batch)

    all_preds = []
    for batch_idx in tqdm(range(len(batches)), desc="Detecting bboxes"):
        batch_image_idxs = batches[batch_idx]
        batch_images = convert_if_not_rgb([images[j] for j in batch_image_idxs])

        split_index = []
        split_heights = []
        image_splits = []
        for image_idx, image in enumerate(batch_images):
            image_parts, split_height = split_image(image, processor)
            image_splits.extend(image_parts)
            split_index.extend([image_idx] * len(image_parts))
            split_heights.extend(split_height)

        image_splits = [prepare_image_detection(image, processor) for image in image_splits]
        # Batch images in dim 0
        batch = torch.stack(image_splits, dim=0).to(model.dtype).to(model.device)

        with torch.inference_mode():
            pred = model(pixel_values=batch)

        logits = pred.logits
        correct_shape = [processor.size["height"], processor.size["width"]]
        current_shape = list(logits.shape[2:])
        if current_shape != correct_shape:
            logits = F.interpolate(logits, size=correct_shape, mode='bilinear', align_corners=False)

        logits = logits.cpu().detach().numpy().astype(np.float32)
        preds = []
        for i, (idx, height) in enumerate(zip(split_index, split_heights)):
            # If our current prediction length is below the image idx, that means we have a new image
            # Otherwise, we need to add to the current image
            if len(preds) <= idx:
                preds.append([logits[i][k] for k in range(heatmap_count)])
            else:
                heatmaps = preds[idx]
                pred_heatmaps = [logits[i][k] for k in range(heatmap_count)]

                if height < processor.size["height"]:
                    # Cut off padding to get original height
                    pred_heatmaps = [pred_heatmap[:height, :] for pred_heatmap in pred_heatmaps]

                for k in range(heatmap_count):
                    heatmaps[k] = np.vstack([heatmaps[k], pred_heatmaps[k]])
                preds[idx] = heatmaps

        all_preds.extend(preds)

    assert len(all_preds) == len(images)
    assert all([len(pred) == heatmap_count for pred in all_preds])
    return all_preds, orig_sizes


def parallel_get_lines(preds, orig_sizes):
    heatmap, affinity_map = preds
    heat_img = Image.fromarray((heatmap * 255).astype(np.uint8))
    aff_img = Image.fromarray((affinity_map * 255).astype(np.uint8))
    affinity_size = list(reversed(affinity_map.shape))
    heatmap_size = list(reversed(heatmap.shape))
    bboxes = get_and_clean_boxes(heatmap, heatmap_size, orig_sizes)
    vertical_lines = get_vertical_lines(affinity_map, affinity_size, orig_sizes)

    result = TextDetectionResult(
        bboxes=bboxes,
        vertical_lines=vertical_lines,
        heatmap=heat_img,
        affinity_map=aff_img,
        image_bbox=[0, 0, orig_sizes[0], orig_sizes[1]]
    )
    return result


def batch_text_detection(images: List, model, processor, batch_size=None) -> List[TextDetectionResult]:
    preds, orig_sizes = batch_detection(images, model, processor, batch_size=batch_size)
    results = []
    if settings.IN_STREAMLIT or len(images) < settings.DETECTOR_MIN_PARALLEL_THRESH:  # Ensures we don't parallelize with streamlit, or with very few images
        for i in range(len(images)):
            result = parallel_get_lines(preds[i], orig_sizes[i])
            results.append(result)
    else:
        max_workers = min(settings.DETECTOR_POSTPROCESSING_CPU_WORKERS, len(images))
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(parallel_get_lines, preds, orig_sizes))

    return results
```
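A sketch of running `batch_text_detection` end to end. The `load_model`/`load_processor` helpers are assumed from the surya v0.4.x `segformer` module (adjust to your checkout), and the image path is illustrative:

```python
from PIL import Image

from surya.detection import batch_text_detection
from surya.model.detection import segformer  # assumed v0.4.x loader module

model, processor = segformer.load_model(), segformer.load_processor()

image = Image.open("page.jpg")                      # illustrative input
[result] = batch_text_detection([image], model, processor)
print(len(result.bboxes), "detected text lines")
```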
surya/input/__pycache__/processing.cpython-310.pyc
ADDED
Binary file (4.05 kB).
surya/input/langs.py
ADDED
@@ -0,0 +1,19 @@
```python
from typing import List
from surya.languages import LANGUAGE_TO_CODE, CODE_TO_LANGUAGE


def replace_lang_with_code(langs: List[str]):
    for i in range(len(langs)):
        if langs[i].title() in LANGUAGE_TO_CODE:
            langs[i] = LANGUAGE_TO_CODE[langs[i].title()]
        if langs[i] not in CODE_TO_LANGUAGE:
            raise ValueError(f"Language code {langs[i]} not found.")


def get_unique_langs(langs: List[List[str]]):
    uniques = []
    for lang_list in langs:
        for lang in lang_list:
            if lang not in uniques:
                uniques.append(lang)
    return uniques
```
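Example of the in-place normalization `replace_lang_with_code` performs (language names become codes; codes are validated):

```python
from surya.input.langs import replace_lang_with_code, get_unique_langs

langs = ["Arabic", "en"]
replace_lang_with_code(langs)
print(langs)                                      # -> ['ar', 'en']
print(get_unique_langs([["ar"], ["ar", "en"]]))   # -> ['ar', 'en']
```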
surya/input/load.py
ADDED
@@ -0,0 +1,74 @@
```python
import PIL

from surya.input.processing import open_pdf, get_page_images
import os
import filetype
from PIL import Image
import json


def get_name_from_path(path):
    return os.path.basename(path).split(".")[0]


def load_pdf(pdf_path, max_pages=None, start_page=None):
    doc = open_pdf(pdf_path)
    last_page = len(doc)

    if start_page:
        assert start_page < last_page and start_page >= 0, f"Start page must be between 0 and {last_page}"
    else:
        start_page = 0

    if max_pages:
        assert max_pages >= 0, f"Max pages must be greater than 0"
        last_page = min(start_page + max_pages, last_page)

    page_indices = list(range(start_page, last_page))
    images = get_page_images(doc, page_indices)
    doc.close()
    names = [get_name_from_path(pdf_path) for _ in page_indices]
    return images, names


def load_image(image_path):
    image = Image.open(image_path).convert("RGB")
    name = get_name_from_path(image_path)
    return [image], [name]


def load_from_file(input_path, max_pages=None, start_page=None):
    input_type = filetype.guess(input_path)
    if input_type.extension == "pdf":
        return load_pdf(input_path, max_pages, start_page)
    else:
        return load_image(input_path)


def load_from_folder(folder_path, max_pages=None, start_page=None):
    image_paths = [os.path.join(folder_path, image_name) for image_name in os.listdir(folder_path) if not image_name.startswith(".")]
    image_paths = [ip for ip in image_paths if not os.path.isdir(ip)]

    images = []
    names = []
    for path in image_paths:
        extension = filetype.guess(path)
        if extension and extension.extension == "pdf":
            image, name = load_pdf(path, max_pages, start_page)
            images.extend(image)
            names.extend(name)
        else:
            try:
                image, name = load_image(path)
                images.extend(image)
                names.extend(name)
            except PIL.UnidentifiedImageError:
                print(f"Could not load image {path}")
                continue
    return images, names


def load_lang_file(lang_path, names):
    with open(lang_path, "r") as f:
        lang_dict = json.load(f)
    return [lang_dict[name].copy() for name in names]
```
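A short sketch of the loader (the input path is hypothetical; PDFs are rendered page by page, single images come back as one-element lists):

```python
from surya.input.load import load_from_file

images, names = load_from_file("document.pdf", max_pages=2)  # hypothetical path
print(len(images), "page images loaded for", names[0])
```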
surya/input/processing.py
ADDED
@@ -0,0 +1,116 @@
```python
from typing import List

import cv2
import numpy as np
import math
import pypdfium2
from PIL import Image, ImageOps, ImageDraw
import torch
from surya.settings import settings


def convert_if_not_rgb(images: List[Image.Image]) -> List[Image.Image]:
    new_images = []
    for image in images:
        if image.mode != "RGB":
            image = image.convert("RGB")
        new_images.append(image)
    return new_images


def get_total_splits(image_size, processor):
    img_height = list(image_size)[1]
    max_height = settings.DETECTOR_IMAGE_CHUNK_HEIGHT
    processor_height = processor.size["height"]
    if img_height > max_height:
        num_splits = math.ceil(img_height / processor_height)
        return num_splits
    return 1


def split_image(img, processor):
    # This will not modify/return the original image - it will either crop, or copy the image
    img_height = list(img.size)[1]
    max_height = settings.DETECTOR_IMAGE_CHUNK_HEIGHT
    processor_height = processor.size["height"]
    if img_height > max_height:
        num_splits = math.ceil(img_height / processor_height)
        splits = []
        split_heights = []
        for i in range(num_splits):
            top = i * processor_height
            bottom = (i + 1) * processor_height
            if bottom > img_height:
                bottom = img_height
            cropped = img.crop((0, top, img.size[0], bottom))
            height = bottom - top
            if height < processor_height:
                cropped = ImageOps.pad(cropped, (img.size[0], processor_height), color=255, centering=(0, 0))
            splits.append(cropped)
            split_heights.append(height)
        return splits, split_heights
    return [img.copy()], [img_height]


def prepare_image_detection(img, processor):
    new_size = (processor.size["width"], processor.size["height"])

    # This double resize actually necessary for downstream accuracy
    img.thumbnail(new_size, Image.Resampling.LANCZOS)
    img = img.resize(new_size, Image.Resampling.LANCZOS)  # Stretch smaller dimension to fit new size

    img = np.asarray(img, dtype=np.uint8)
    img = processor(img)["pixel_values"][0]
    img = torch.from_numpy(img)
    return img


def open_pdf(pdf_filepath):
    return pypdfium2.PdfDocument(pdf_filepath)


def get_page_images(doc, indices: List, dpi=settings.IMAGE_DPI):
    renderer = doc.render(
        pypdfium2.PdfBitmap.to_pil,
        page_indices=indices,
        scale=dpi / 72,
    )
    images = list(renderer)
    images = [image.convert("RGB") for image in images]
    return images


def slice_bboxes_from_image(image: Image.Image, bboxes):
    lines = []
    for bbox in bboxes:
        line = image.crop((bbox[0], bbox[1], bbox[2], bbox[3]))
        lines.append(line)
    return lines


def slice_polys_from_image(image: Image.Image, polys):
    image_array = np.array(image, dtype=np.uint8)
    lines = []
    for idx, poly in enumerate(polys):
        lines.append(slice_and_pad_poly(image_array, poly))
    return lines


def slice_and_pad_poly(image_array: np.array, coordinates):
    # Draw polygon onto mask
    coordinates = [(corner[0], corner[1]) for corner in coordinates]
    bbox = [min([x[0] for x in coordinates]), min([x[1] for x in coordinates]), max([x[0] for x in coordinates]), max([x[1] for x in coordinates])]

    # We mask out anything not in the polygon
    cropped_polygon = image_array[bbox[1]:bbox[3], bbox[0]:bbox[2]].copy()
    coordinates = [(x - bbox[0], y - bbox[1]) for x, y in coordinates]

    # Pad the area outside the polygon with the pad value
    mask = np.zeros(cropped_polygon.shape[:2], dtype=np.uint8)
    cv2.fillPoly(mask, [np.int32(coordinates)], 1)
    mask = np.stack([mask] * 3, axis=-1)

    cropped_polygon[mask == 0] = settings.RECOGNITION_PAD_VALUE
    rectangle_image = Image.fromarray(cropped_polygon)

    return rectangle_image
```
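For instance, `slice_bboxes_from_image` crops one PIL image per box; a minimal sketch with illustrative coordinates and input path:

```python
from PIL import Image
from surya.input.processing import slice_bboxes_from_image

image = Image.open("page.jpg")                               # illustrative input
line_crops = slice_bboxes_from_image(image, [[100, 40, 900, 80], [100, 90, 900, 130]])
print(len(line_crops), "line crops")
```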
surya/languages.py
ADDED
@@ -0,0 +1,101 @@
```python
CODE_TO_LANGUAGE = {
    'af': 'Afrikaans',
    'am': 'Amharic',
    'ar': 'Arabic',
    'as': 'Assamese',
    'az': 'Azerbaijani',
    'be': 'Belarusian',
    'bg': 'Bulgarian',
    'bn': 'Bengali',
    'br': 'Breton',
    'bs': 'Bosnian',
    'ca': 'Catalan',
    'cs': 'Czech',
    'cy': 'Welsh',
    'da': 'Danish',
    'de': 'German',
    'el': 'Greek',
    'en': 'English',
    'eo': 'Esperanto',
    'es': 'Spanish',
    'et': 'Estonian',
    'eu': 'Basque',
    'fa': 'Persian',
    'fi': 'Finnish',
    'fr': 'French',
    'fy': 'Western Frisian',
    'ga': 'Irish',
    'gd': 'Scottish Gaelic',
    'gl': 'Galician',
    'gu': 'Gujarati',
    'ha': 'Hausa',
    'he': 'Hebrew',
    'hi': 'Hindi',
    'hr': 'Croatian',
    'hu': 'Hungarian',
    'hy': 'Armenian',
    'id': 'Indonesian',
    'is': 'Icelandic',
    'it': 'Italian',
    'ja': 'Japanese',
    'jv': 'Javanese',
    'ka': 'Georgian',
    'kk': 'Kazakh',
    'km': 'Khmer',
    'kn': 'Kannada',
    'ko': 'Korean',
    'ku': 'Kurdish',
    'ky': 'Kyrgyz',
    'la': 'Latin',
    'lo': 'Lao',
    'lt': 'Lithuanian',
    'lv': 'Latvian',
    'mg': 'Malagasy',
    'mk': 'Macedonian',
    'ml': 'Malayalam',
    'mn': 'Mongolian',
    'mr': 'Marathi',
    'ms': 'Malay',
    'my': 'Burmese',
    'ne': 'Nepali',
    'nl': 'Dutch',
    'no': 'Norwegian',
    'om': 'Oromo',
    'or': 'Oriya',
    'pa': 'Punjabi',
    'pl': 'Polish',
    'ps': 'Pashto',
    'pt': 'Portuguese',
    'ro': 'Romanian',
    'ru': 'Russian',
    'sa': 'Sanskrit',
    'sd': 'Sindhi',
    'si': 'Sinhala',
    'sk': 'Slovak',
    'sl': 'Slovenian',
    'so': 'Somali',
    'sq': 'Albanian',
    'sr': 'Serbian',
    'su': 'Sundanese',
    'sv': 'Swedish',
    'sw': 'Swahili',
    'ta': 'Tamil',
    'te': 'Telugu',
    'th': 'Thai',
    'tl': 'Tagalog',
    'tr': 'Turkish',
    'ug': 'Uyghur',
    'uk': 'Ukrainian',
    'ur': 'Urdu',
    'uz': 'Uzbek',
    'vi': 'Vietnamese',
    'xh': 'Xhosa',
    'yi': 'Yiddish',
    'zh': 'Chinese',
}

LANGUAGE_TO_CODE = {v: k for k, v in CODE_TO_LANGUAGE.items()}


def is_arabic(lang_code):
    return lang_code in ["ar", "fa", "ps", "ug", "ur"]
```
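Quick sanity check of the language tables and the Arabic-script helper:

```python
from surya.languages import LANGUAGE_TO_CODE, is_arabic

print(LANGUAGE_TO_CODE["Arabic"])          # -> 'ar'
print(is_arabic("ar"), is_arabic("en"))    # -> True False
```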
surya/layout.py
ADDED
@@ -0,0 +1,204 @@
```python
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from typing import List, Optional
from PIL import Image
import numpy as np

from surya.detection import batch_detection
from surya.postprocessing.heatmap import keep_largest_boxes, get_and_clean_boxes, get_detected_boxes
from surya.schema import LayoutResult, LayoutBox, TextDetectionResult
from surya.settings import settings


def get_regions_from_detection_result(detection_result: TextDetectionResult, heatmaps: List[np.ndarray], orig_size, id2label, segment_assignment, vertical_line_width=20) -> List[LayoutBox]:
    logits = np.stack(heatmaps, axis=0)
    vertical_line_bboxes = [line for line in detection_result.vertical_lines]
    line_bboxes = detection_result.bboxes

    # Scale back to processor size
    for line in vertical_line_bboxes:
        line.rescale_bbox(orig_size, list(reversed(heatmaps[0].shape)))

    for line in line_bboxes:
        line.rescale(orig_size, list(reversed(heatmaps[0].shape)))

    for bbox in vertical_line_bboxes:
        # Give some width to the vertical lines
        vert_bbox = list(bbox.bbox)
        vert_bbox[2] = min(heatmaps[0].shape[0], vert_bbox[2] + vertical_line_width)

        logits[:, vert_bbox[1]:vert_bbox[3], vert_bbox[0]:vert_bbox[2]] = 0  # zero out where the column lines are

    logits[:, logits[0] >= .5] = 0  # zero out where blanks are

    # Zero out where other segments are
    for i in range(logits.shape[0]):
        logits[i, segment_assignment != i] = 0

    detected_boxes = []
    for heatmap_idx in range(1, len(id2label)):  # Skip the blank class
        heatmap = logits[heatmap_idx]
        bboxes = get_detected_boxes(heatmap)
        bboxes = [bbox for bbox in bboxes if bbox.area > 25]
        for bb in bboxes:
            bb.fit_to_bounds([0, 0, heatmap.shape[1] - 1, heatmap.shape[0] - 1])

        for bbox in bboxes:
            detected_boxes.append(LayoutBox(polygon=bbox.polygon, label=id2label[heatmap_idx], confidence=1))

    detected_boxes = sorted(detected_boxes, key=lambda x: x.confidence, reverse=True)
    # Expand bbox to cover intersecting lines
    box_lines = defaultdict(list)
    used_lines = set()

    # We try 2 rounds of identifying the correct lines to snap to
    # First round is majority intersection, second lowers the threshold
    for thresh in [.5, .4]:
        for bbox_idx, bbox in enumerate(detected_boxes):
            for line_idx, line_bbox in enumerate(line_bboxes):
                if line_bbox.intersection_pct(bbox) > thresh and line_idx not in used_lines:
                    box_lines[bbox_idx].append(line_bbox.bbox)
                    used_lines.add(line_idx)

    new_boxes = []
    for bbox_idx, bbox in enumerate(detected_boxes):
        if bbox.label == "Picture" and bbox.area < 200:  # Remove very small figures
            continue

        # Skip if we didn't find any lines to snap to, except for Pictures and Formulas
        if bbox_idx not in box_lines and bbox.label not in ["Picture", "Formula"]:
            continue

        covered_lines = box_lines[bbox_idx]
        # Snap non-picture layout boxes to correct text boundaries
        if len(covered_lines) > 0 and bbox.label not in ["Picture"]:
            min_x = min([line[0] for line in covered_lines])
            min_y = min([line[1] for line in covered_lines])
            max_x = max([line[2] for line in covered_lines])
            max_y = max([line[3] for line in covered_lines])

            # Tables and formulas can contain text, but text isn't the whole area
            if bbox.label in ["Table", "Formula"]:
                min_x_box = min([b[0] for b in bbox.polygon])
                min_y_box = min([b[1] for b in bbox.polygon])
                max_x_box = max([b[0] for b in bbox.polygon])
                max_y_box = max([b[1] for b in bbox.polygon])

                min_x = min(min_x, min_x_box)
                min_y = min(min_y, min_y_box)
                max_x = max(max_x, max_x_box)
                max_y = max(max_y, max_y_box)

            bbox.polygon[0][0] = min_x
            bbox.polygon[0][1] = min_y
            bbox.polygon[1][0] = max_x
            bbox.polygon[1][1] = min_y
            bbox.polygon[2][0] = max_x
            bbox.polygon[2][1] = max_y
            bbox.polygon[3][0] = min_x
            bbox.polygon[3][1] = max_y

        if bbox_idx in box_lines and bbox.label in ["Picture"]:
            bbox.label = "Figure"

        new_boxes.append(bbox)

    # Merge tables together (sometimes one column is detected as a separate table)
    for i in range(5):  # Up to 5 rounds of merging
        to_remove = set()
        for bbox_idx, bbox in enumerate(new_boxes):
            if bbox.label != "Table" or bbox_idx in to_remove:
                continue

            for bbox_idx2, bbox2 in enumerate(new_boxes):
                if bbox2.label != "Table" or bbox_idx2 in to_remove or bbox_idx == bbox_idx2:
                    continue

                if bbox.intersection_pct(bbox2) > 0:
                    bbox.merge(bbox2)
                    to_remove.add(bbox_idx2)

        new_boxes = [bbox for idx, bbox in enumerate(new_boxes) if idx not in to_remove]

    # Ensure we account for all text lines in the layout
    unused_lines = [line for idx, line in enumerate(line_bboxes) if idx not in used_lines]
    for bbox in unused_lines:
        new_boxes.append(LayoutBox(polygon=bbox.polygon, label="Text", confidence=.5))

    for bbox in new_boxes:
        bbox.rescale(list(reversed(heatmaps[0].shape)), orig_size)

    detected_boxes = [bbox for bbox in new_boxes if bbox.area > 16]

    # Remove bboxes contained inside others, unless they're captions
    contained_bbox = []
    for i, bbox in enumerate(detected_boxes):
        for j, bbox2 in enumerate(detected_boxes):
            if i == j:
                continue

            if bbox2.intersection_pct(bbox) >= .95 and bbox2.label not in ["Caption"]:
                contained_bbox.append(j)

    detected_boxes = [bbox for idx, bbox in enumerate(detected_boxes) if idx not in contained_bbox]

    return detected_boxes


def get_regions(heatmaps: List[np.ndarray], orig_size, id2label, segment_assignment) -> List[LayoutBox]:
    bboxes = []
    for i in range(1, len(id2label)):  # Skip the blank class
        heatmap = heatmaps[i]
        assert heatmap.shape == segment_assignment.shape
        heatmap[segment_assignment != i] = 0  # zero out where another segment is
        bbox = get_and_clean_boxes(heatmap, list(reversed(heatmap.shape)), orig_size)
        for bb in bbox:
            bboxes.append(LayoutBox(polygon=bb.polygon, label=id2label[i]))
        heatmaps.append(heatmap)

    bboxes = keep_largest_boxes(bboxes)
    return bboxes


def parallel_get_regions(heatmaps: List[np.ndarray], orig_size, id2label, detection_results=None) -> LayoutResult:
    logits = np.stack(heatmaps, axis=0)
    segment_assignment = logits.argmax(axis=0)
    if detection_results is not None:
        bboxes = get_regions_from_detection_result(detection_results, heatmaps, orig_size, id2label,
                                                   segment_assignment)
    else:
        bboxes = get_regions(heatmaps, orig_size, id2label, segment_assignment)

    segmentation_img = Image.fromarray(segment_assignment.astype(np.uint8))

    result = LayoutResult(
        bboxes=bboxes,
        segmentation_map=segmentation_img,
        heatmaps=heatmaps,
        image_bbox=[0, 0, orig_size[0], orig_size[1]]
    )

    return result


def batch_layout_detection(images: List, model, processor, detection_results: Optional[List[TextDetectionResult]] = None, batch_size=None) -> List[LayoutResult]:
    preds, orig_sizes = batch_detection(images, model, processor, batch_size=batch_size)
    id2label = model.config.id2label

    results = []
    if settings.IN_STREAMLIT or len(images) < settings.DETECTOR_MIN_PARALLEL_THRESH:  # Ensures we don't parallelize with streamlit or too few images
        for i in range(len(images)):
            result = parallel_get_regions(preds[i], orig_sizes[i], id2label, detection_results[i] if detection_results else None)
            results.append(result)
    else:
        futures = []
        max_workers = min(settings.DETECTOR_POSTPROCESSING_CPU_WORKERS, len(images))
```
|
196 |
+
with ProcessPoolExecutor(max_workers=max_workers) as executor:
|
197 |
+
for i in range(len(images)):
|
198 |
+
future = executor.submit(parallel_get_regions, preds[i], orig_sizes[i], id2label, detection_results[i] if detection_results else None)
|
199 |
+
futures.append(future)
|
200 |
+
|
201 |
+
for future in futures:
|
202 |
+
results.append(future.result())
|
203 |
+
|
204 |
+
return results
|
surya/model/detection/__pycache__/processor.cpython-310.pyc
ADDED
Binary file (11.6 kB). View file
|
|
surya/model/detection/__pycache__/segformer.cpython-310.pyc
ADDED
Binary file (14.5 kB). View file
|
|
surya/model/detection/processor.py
ADDED
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import warnings
|
2 |
+
from typing import Any, Dict, List, Optional, Union
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
|
7 |
+
from transformers.image_transforms import to_channel_dimension_format
|
8 |
+
from transformers.image_utils import (
|
9 |
+
IMAGENET_DEFAULT_MEAN,
|
10 |
+
IMAGENET_DEFAULT_STD,
|
11 |
+
ChannelDimension,
|
12 |
+
ImageInput,
|
13 |
+
PILImageResampling,
|
14 |
+
infer_channel_dimension_format,
|
15 |
+
make_list_of_images,
|
16 |
+
)
|
17 |
+
from transformers.utils import TensorType
|
18 |
+
|
19 |
+
|
20 |
+
import PIL.Image
|
21 |
+
import torch
|
22 |
+
|
23 |
+
|
24 |
+
class SegformerImageProcessor(BaseImageProcessor):
|
25 |
+
r"""
|
26 |
+
Constructs a Segformer image processor.
|
27 |
+
|
28 |
+
Args:
|
29 |
+
do_resize (`bool`, *optional*, defaults to `True`):
|
30 |
+
Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
|
31 |
+
size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
|
32 |
+
size (`Dict[str, int]` *optional*, defaults to `{"height": 512, "width": 512}`):
|
33 |
+
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
|
34 |
+
method.
|
35 |
+
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
|
36 |
+
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
|
37 |
+
`preprocess` method.
|
38 |
+
do_rescale (`bool`, *optional*, defaults to `True`):
|
39 |
+
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
|
40 |
+
parameter in the `preprocess` method.
|
41 |
+
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
42 |
+
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
|
43 |
+
method.
|
44 |
+
do_normalize (`bool`, *optional*, defaults to `True`):
|
45 |
+
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
|
46 |
+
method.
|
47 |
+
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
|
48 |
+
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
|
49 |
+
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
|
50 |
+
image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
|
51 |
+
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
|
52 |
+
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
|
53 |
+
do_reduce_labels (`bool`, *optional*, defaults to `False`):
|
54 |
+
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
|
55 |
+
used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
|
56 |
+
background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the
|
57 |
+
`preprocess` method.
|
58 |
+
"""
|
59 |
+
|
60 |
+
model_input_names = ["pixel_values"]
|
61 |
+
|
62 |
+
def __init__(
|
63 |
+
self,
|
64 |
+
do_resize: bool = True,
|
65 |
+
size: Dict[str, int] = None,
|
66 |
+
resample: PILImageResampling = PILImageResampling.BILINEAR,
|
67 |
+
do_rescale: bool = True,
|
68 |
+
rescale_factor: Union[int, float] = 1 / 255,
|
69 |
+
do_normalize: bool = True,
|
70 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
71 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
72 |
+
do_reduce_labels: bool = False,
|
73 |
+
**kwargs,
|
74 |
+
) -> None:
|
75 |
+
if "reduce_labels" in kwargs:
|
76 |
+
warnings.warn(
|
77 |
+
"The `reduce_labels` parameter is deprecated and will be removed in a future version. Please use "
|
78 |
+
"`do_reduce_labels` instead.",
|
79 |
+
FutureWarning,
|
80 |
+
)
|
81 |
+
do_reduce_labels = kwargs.pop("reduce_labels")
|
82 |
+
|
83 |
+
super().__init__(**kwargs)
|
84 |
+
size = size if size is not None else {"height": 512, "width": 512}
|
85 |
+
size = get_size_dict(size)
|
86 |
+
self.do_resize = do_resize
|
87 |
+
self.size = size
|
88 |
+
self.resample = resample
|
89 |
+
self.do_rescale = do_rescale
|
90 |
+
self.rescale_factor = rescale_factor
|
91 |
+
self.do_normalize = do_normalize
|
92 |
+
self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
|
93 |
+
self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
|
94 |
+
self.do_reduce_labels = do_reduce_labels
|
95 |
+
self._valid_processor_keys = [
|
96 |
+
"images",
|
97 |
+
"segmentation_maps",
|
98 |
+
"do_resize",
|
99 |
+
"size",
|
100 |
+
"resample",
|
101 |
+
"do_rescale",
|
102 |
+
"rescale_factor",
|
103 |
+
"do_normalize",
|
104 |
+
"image_mean",
|
105 |
+
"image_std",
|
106 |
+
"do_reduce_labels",
|
107 |
+
"return_tensors",
|
108 |
+
"data_format",
|
109 |
+
"input_data_format",
|
110 |
+
]
|
111 |
+
|
112 |
+
@classmethod
|
113 |
+
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
|
114 |
+
"""
|
115 |
+
Overrides the `from_dict` method from the base class to make sure `do_reduce_labels` is updated if image
|
116 |
+
processor is created using from_dict and kwargs e.g. `SegformerImageProcessor.from_pretrained(checkpoint,
|
117 |
+
reduce_labels=True)`
|
118 |
+
"""
|
119 |
+
image_processor_dict = image_processor_dict.copy()
|
120 |
+
if "reduce_labels" in kwargs:
|
121 |
+
image_processor_dict["reduce_labels"] = kwargs.pop("reduce_labels")
|
122 |
+
return super().from_dict(image_processor_dict, **kwargs)
|
123 |
+
|
124 |
+
def _preprocess(
|
125 |
+
self,
|
126 |
+
image: ImageInput,
|
127 |
+
do_resize: bool,
|
128 |
+
do_rescale: bool,
|
129 |
+
do_normalize: bool,
|
130 |
+
size: Optional[Dict[str, int]] = None,
|
131 |
+
resample: PILImageResampling = None,
|
132 |
+
rescale_factor: Optional[float] = None,
|
133 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
134 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
135 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
136 |
+
):
|
137 |
+
|
138 |
+
if do_rescale:
|
139 |
+
image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
|
140 |
+
|
141 |
+
if do_normalize:
|
142 |
+
image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
|
143 |
+
|
144 |
+
return image
|
145 |
+
|
146 |
+
def _preprocess_image(
|
147 |
+
self,
|
148 |
+
image: ImageInput,
|
149 |
+
do_resize: bool = None,
|
150 |
+
size: Dict[str, int] = None,
|
151 |
+
resample: PILImageResampling = None,
|
152 |
+
do_rescale: bool = None,
|
153 |
+
rescale_factor: float = None,
|
154 |
+
do_normalize: bool = None,
|
155 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
156 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
157 |
+
data_format: Optional[Union[str, ChannelDimension]] = None,
|
158 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
159 |
+
) -> np.ndarray:
|
160 |
+
"""Preprocesses a single image."""
|
161 |
+
# All transformations expect numpy arrays.
|
162 |
+
if input_data_format is None:
|
163 |
+
input_data_format = infer_channel_dimension_format(image)
|
164 |
+
|
165 |
+
image = self._preprocess(
|
166 |
+
image=image,
|
167 |
+
do_resize=do_resize,
|
168 |
+
size=size,
|
169 |
+
resample=resample,
|
170 |
+
do_rescale=do_rescale,
|
171 |
+
rescale_factor=rescale_factor,
|
172 |
+
do_normalize=do_normalize,
|
173 |
+
image_mean=image_mean,
|
174 |
+
image_std=image_std,
|
175 |
+
input_data_format=input_data_format,
|
176 |
+
)
|
177 |
+
if data_format is not None:
|
178 |
+
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
179 |
+
return image
|
180 |
+
|
181 |
+
def __call__(self, images, segmentation_maps=None, **kwargs):
|
182 |
+
"""
|
183 |
+
Preprocesses a batch of images and optionally segmentation maps.
|
184 |
+
|
185 |
+
Overrides the `__call__` method of the `Preprocessor` class so that both images and segmentation maps can be
|
186 |
+
passed in as positional arguments.
|
187 |
+
"""
|
188 |
+
return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs)
|
189 |
+
|
190 |
+
def preprocess(
|
191 |
+
self,
|
192 |
+
images: ImageInput,
|
193 |
+
segmentation_maps: Optional[ImageInput] = None,
|
194 |
+
do_resize: Optional[bool] = None,
|
195 |
+
size: Optional[Dict[str, int]] = None,
|
196 |
+
resample: PILImageResampling = None,
|
197 |
+
do_rescale: Optional[bool] = None,
|
198 |
+
rescale_factor: Optional[float] = None,
|
199 |
+
do_normalize: Optional[bool] = None,
|
200 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
201 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
202 |
+
do_reduce_labels: Optional[bool] = None,
|
203 |
+
return_tensors: Optional[Union[str, TensorType]] = None,
|
204 |
+
data_format: ChannelDimension = ChannelDimension.FIRST,
|
205 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
206 |
+
**kwargs,
|
207 |
+
) -> PIL.Image.Image:
|
208 |
+
"""
|
209 |
+
Preprocess an image or batch of images.
|
210 |
+
|
211 |
+
Args:
|
212 |
+
images (`ImageInput`):
|
213 |
+
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
|
214 |
+
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
|
215 |
+
segmentation_maps (`ImageInput`, *optional*):
|
216 |
+
Segmentation map to preprocess.
|
217 |
+
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
218 |
+
Whether to resize the image.
|
219 |
+
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
|
220 |
+
Size of the image after `resize` is applied.
|
221 |
+
resample (`int`, *optional*, defaults to `self.resample`):
|
222 |
+
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
|
223 |
+
has an effect if `do_resize` is set to `True`.
|
224 |
+
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
225 |
+
Whether to rescale the image values between [0 - 1].
|
226 |
+
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
227 |
+
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
|
228 |
+
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
229 |
+
Whether to normalize the image.
|
230 |
+
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
231 |
+
Image mean.
|
232 |
+
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
233 |
+
Image standard deviation.
|
234 |
+
do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
|
235 |
+
Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
|
236 |
+
is used for background, and background itself is not included in all classes of a dataset (e.g.
|
237 |
+
ADE20k). The background label will be replaced by 255.
|
238 |
+
return_tensors (`str` or `TensorType`, *optional*):
|
239 |
+
The type of tensors to return. Can be one of:
|
240 |
+
- Unset: Return a list of `np.ndarray`.
|
241 |
+
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
|
242 |
+
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
|
243 |
+
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
|
244 |
+
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
|
245 |
+
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
|
246 |
+
The channel dimension format for the output image. Can be one of:
|
247 |
+
- `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
248 |
+
- `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
249 |
+
input_data_format (`ChannelDimension` or `str`, *optional*):
|
250 |
+
The channel dimension format for the input image. If unset, the channel dimension format is inferred
|
251 |
+
from the input image. Can be one of:
|
252 |
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
253 |
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
254 |
+
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
255 |
+
"""
|
256 |
+
do_resize = do_resize if do_resize is not None else self.do_resize
|
257 |
+
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
258 |
+
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
259 |
+
resample = resample if resample is not None else self.resample
|
260 |
+
size = size if size is not None else self.size
|
261 |
+
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
262 |
+
image_mean = image_mean if image_mean is not None else self.image_mean
|
263 |
+
image_std = image_std if image_std is not None else self.image_std
|
264 |
+
|
265 |
+
images = make_list_of_images(images)
|
266 |
+
images = [
|
267 |
+
self._preprocess_image(
|
268 |
+
image=img,
|
269 |
+
do_resize=do_resize,
|
270 |
+
resample=resample,
|
271 |
+
size=size,
|
272 |
+
do_rescale=do_rescale,
|
273 |
+
rescale_factor=rescale_factor,
|
274 |
+
do_normalize=do_normalize,
|
275 |
+
image_mean=image_mean,
|
276 |
+
image_std=image_std,
|
277 |
+
data_format=data_format,
|
278 |
+
input_data_format=input_data_format,
|
279 |
+
)
|
280 |
+
for img in images
|
281 |
+
]
|
282 |
+
|
283 |
+
data = {"pixel_values": images}
|
284 |
+
return BatchFeature(data=data, tensor_type=return_tensors)
|
surya/model/detection/segformer.py
ADDED
@@ -0,0 +1,468 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gc
|
2 |
+
import warnings
|
3 |
+
|
4 |
+
from transformers.activations import ACT2FN
|
5 |
+
from transformers.pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
|
6 |
+
|
7 |
+
warnings.filterwarnings("ignore", message="torch.utils._pytree._register_pytree_node is deprecated")
|
8 |
+
|
9 |
+
import math
|
10 |
+
from typing import Optional, Tuple, Union
|
11 |
+
|
12 |
+
from transformers import SegformerConfig, SegformerForSemanticSegmentation, SegformerDecodeHead, \
|
13 |
+
SegformerPreTrainedModel
|
14 |
+
from surya.model.detection.processor import SegformerImageProcessor
|
15 |
+
import torch
|
16 |
+
from torch import nn
|
17 |
+
|
18 |
+
from transformers.modeling_outputs import SemanticSegmenterOutput, BaseModelOutput
|
19 |
+
from surya.settings import settings
|
20 |
+
|
21 |
+
|
22 |
+
def load_model(checkpoint=settings.DETECTOR_MODEL_CHECKPOINT, device=settings.TORCH_DEVICE_DETECTION, dtype=settings.MODEL_DTYPE_DETECTION):
|
23 |
+
config = SegformerConfig.from_pretrained(checkpoint)
|
24 |
+
model = SegformerForRegressionMask.from_pretrained(checkpoint, torch_dtype=dtype, config=config)
|
25 |
+
if "mps" in device:
|
26 |
+
print("Warning: MPS may have poor results. This is a bug with MPS, see here - https://github.com/pytorch/pytorch/issues/84936")
|
27 |
+
model = model.to(device)
|
28 |
+
model = model.eval()
|
29 |
+
print(f"Loaded detection model {checkpoint} on device {device} with dtype {dtype}")
|
30 |
+
return model
|
31 |
+
|
32 |
+
|
33 |
+
def load_processor(checkpoint=settings.DETECTOR_MODEL_CHECKPOINT):
|
34 |
+
processor = SegformerImageProcessor.from_pretrained(checkpoint)
|
35 |
+
return processor
|
36 |
+
|
37 |
+
|
38 |
+
class SegformerForMaskMLP(nn.Module):
|
39 |
+
def __init__(self, config: SegformerConfig, input_dim, output_dim):
|
40 |
+
super().__init__()
|
41 |
+
self.proj = nn.Linear(input_dim, output_dim)
|
42 |
+
|
43 |
+
def forward(self, hidden_states: torch.Tensor):
|
44 |
+
hidden_states = hidden_states.flatten(2).transpose(1, 2)
|
45 |
+
hidden_states = self.proj(hidden_states)
|
46 |
+
return hidden_states
|
47 |
+
|
48 |
+
|
49 |
+
class SegformerForMaskDecodeHead(SegformerDecodeHead):
|
50 |
+
def __init__(self, config):
|
51 |
+
super().__init__(config)
|
52 |
+
decoder_layer_hidden_size = getattr(config, "decoder_layer_hidden_size", config.decoder_hidden_size)
|
53 |
+
|
54 |
+
# linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size
|
55 |
+
mlps = []
|
56 |
+
for i in range(config.num_encoder_blocks):
|
57 |
+
mlp = SegformerForMaskMLP(config, input_dim=config.hidden_sizes[i], output_dim=decoder_layer_hidden_size)
|
58 |
+
mlps.append(mlp)
|
59 |
+
self.linear_c = nn.ModuleList(mlps)
|
60 |
+
|
61 |
+
# the following 3 layers implement the ConvModule of the original implementation
|
62 |
+
self.linear_fuse = nn.Conv2d(
|
63 |
+
in_channels=decoder_layer_hidden_size * config.num_encoder_blocks,
|
64 |
+
out_channels=config.decoder_hidden_size,
|
65 |
+
kernel_size=1,
|
66 |
+
bias=False,
|
67 |
+
)
|
68 |
+
self.batch_norm = nn.BatchNorm2d(config.decoder_hidden_size)
|
69 |
+
self.activation = nn.ReLU()
|
70 |
+
|
71 |
+
self.classifier = nn.Conv2d(config.decoder_hidden_size, config.num_labels, kernel_size=1)
|
72 |
+
|
73 |
+
self.config = config
|
74 |
+
|
75 |
+
def forward(self, encoder_hidden_states: torch.FloatTensor) -> torch.Tensor:
|
76 |
+
batch_size = encoder_hidden_states[-1].shape[0]
|
77 |
+
|
78 |
+
all_hidden_states = ()
|
79 |
+
for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.linear_c):
|
80 |
+
if self.config.reshape_last_stage is False and encoder_hidden_state.ndim == 3:
|
81 |
+
height = width = int(math.sqrt(encoder_hidden_state.shape[-1]))
|
82 |
+
encoder_hidden_state = (
|
83 |
+
encoder_hidden_state.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
|
84 |
+
)
|
85 |
+
|
86 |
+
# unify channel dimension
|
87 |
+
height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3]
|
88 |
+
encoder_hidden_state = mlp(encoder_hidden_state)
|
89 |
+
encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1)
|
90 |
+
encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width)
|
91 |
+
# upsample
|
92 |
+
encoder_hidden_state = encoder_hidden_state.contiguous()
|
93 |
+
encoder_hidden_state = nn.functional.interpolate(
|
94 |
+
encoder_hidden_state, size=encoder_hidden_states[0].size()[2:], mode="bilinear", align_corners=False
|
95 |
+
)
|
96 |
+
all_hidden_states += (encoder_hidden_state,)
|
97 |
+
|
98 |
+
hidden_states = self.linear_fuse(torch.cat(all_hidden_states[::-1], dim=1))
|
99 |
+
hidden_states = self.batch_norm(hidden_states)
|
100 |
+
hidden_states = self.activation(hidden_states)
|
101 |
+
|
102 |
+
# logits are of shape (batch_size, num_labels, height/4, width/4)
|
103 |
+
logits = self.classifier(hidden_states)
|
104 |
+
|
105 |
+
return logits
|
106 |
+
|
107 |
+
|
108 |
+
class SegformerOverlapPatchEmbeddings(nn.Module):
|
109 |
+
"""Construct the overlapping patch embeddings."""
|
110 |
+
|
111 |
+
def __init__(self, patch_size, stride, num_channels, hidden_size):
|
112 |
+
super().__init__()
|
113 |
+
self.proj = nn.Conv2d(
|
114 |
+
num_channels,
|
115 |
+
hidden_size,
|
116 |
+
kernel_size=patch_size,
|
117 |
+
stride=stride,
|
118 |
+
padding=patch_size // 2,
|
119 |
+
)
|
120 |
+
|
121 |
+
self.layer_norm = nn.LayerNorm(hidden_size)
|
122 |
+
|
123 |
+
def forward(self, pixel_values):
|
124 |
+
embeddings = self.proj(pixel_values)
|
125 |
+
_, _, height, width = embeddings.shape
|
126 |
+
# (batch_size, num_channels, height, width) -> (batch_size, num_channels, height*width) -> (batch_size, height*width, num_channels)
|
127 |
+
# this can be fed to a Transformer layer
|
128 |
+
embeddings = embeddings.flatten(2).transpose(1, 2)
|
129 |
+
embeddings = self.layer_norm(embeddings)
|
130 |
+
return embeddings, height, width
|
131 |
+
|
132 |
+
|
133 |
+
class SegformerEfficientSelfAttention(nn.Module):
|
134 |
+
"""SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
|
135 |
+
paper](https://arxiv.org/abs/2102.12122)."""
|
136 |
+
|
137 |
+
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
|
138 |
+
super().__init__()
|
139 |
+
self.hidden_size = hidden_size
|
140 |
+
self.num_attention_heads = num_attention_heads
|
141 |
+
|
142 |
+
if self.hidden_size % self.num_attention_heads != 0:
|
143 |
+
raise ValueError(
|
144 |
+
f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
|
145 |
+
f"heads ({self.num_attention_heads})"
|
146 |
+
)
|
147 |
+
|
148 |
+
self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
|
149 |
+
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
150 |
+
|
151 |
+
self.query = nn.Linear(self.hidden_size, self.all_head_size)
|
152 |
+
self.key = nn.Linear(self.hidden_size, self.all_head_size)
|
153 |
+
self.value = nn.Linear(self.hidden_size, self.all_head_size)
|
154 |
+
|
155 |
+
self.sr_ratio = sequence_reduction_ratio
|
156 |
+
if sequence_reduction_ratio > 1:
|
157 |
+
self.sr = nn.Conv2d(
|
158 |
+
hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio
|
159 |
+
)
|
160 |
+
self.layer_norm = nn.LayerNorm(hidden_size)
|
161 |
+
|
162 |
+
def transpose_for_scores(self, hidden_states):
|
163 |
+
new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
164 |
+
hidden_states = hidden_states.view(new_shape)
|
165 |
+
return hidden_states.permute(0, 2, 1, 3)
|
166 |
+
|
167 |
+
def forward(
|
168 |
+
self,
|
169 |
+
hidden_states,
|
170 |
+
height,
|
171 |
+
width,
|
172 |
+
output_attentions=False,
|
173 |
+
):
|
174 |
+
query_layer = self.transpose_for_scores(self.query(hidden_states))
|
175 |
+
|
176 |
+
if self.sr_ratio > 1:
|
177 |
+
batch_size, seq_len, num_channels = hidden_states.shape
|
178 |
+
# Reshape to (batch_size, num_channels, height, width)
|
179 |
+
hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
|
180 |
+
# Apply sequence reduction
|
181 |
+
hidden_states = self.sr(hidden_states)
|
182 |
+
# Reshape back to (batch_size, seq_len, num_channels)
|
183 |
+
hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
|
184 |
+
hidden_states = self.layer_norm(hidden_states)
|
185 |
+
|
186 |
+
key_layer = self.transpose_for_scores(self.key(hidden_states))
|
187 |
+
value_layer = self.transpose_for_scores(self.value(hidden_states))
|
188 |
+
|
189 |
+
# Take the dot product between "query" and "key" to get the raw attention scores.
|
190 |
+
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
191 |
+
|
192 |
+
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
193 |
+
|
194 |
+
# Normalize the attention scores to probabilities.
|
195 |
+
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
|
196 |
+
|
197 |
+
context_layer = torch.matmul(attention_probs, value_layer)
|
198 |
+
|
199 |
+
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
200 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
201 |
+
context_layer = context_layer.view(new_context_layer_shape)
|
202 |
+
|
203 |
+
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
|
204 |
+
|
205 |
+
return outputs
|
206 |
+
|
207 |
+
class SegformerEncoder(nn.Module):
|
208 |
+
def __init__(self, config):
|
209 |
+
super().__init__()
|
210 |
+
self.config = config
|
211 |
+
|
212 |
+
# patch embeddings
|
213 |
+
embeddings = []
|
214 |
+
for i in range(config.num_encoder_blocks):
|
215 |
+
embeddings.append(
|
216 |
+
SegformerOverlapPatchEmbeddings(
|
217 |
+
patch_size=config.patch_sizes[i],
|
218 |
+
stride=config.strides[i],
|
219 |
+
num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
|
220 |
+
hidden_size=config.hidden_sizes[i],
|
221 |
+
)
|
222 |
+
)
|
223 |
+
self.patch_embeddings = nn.ModuleList(embeddings)
|
224 |
+
|
225 |
+
# Transformer blocks
|
226 |
+
blocks = []
|
227 |
+
cur = 0
|
228 |
+
for i in range(config.num_encoder_blocks):
|
229 |
+
# each block consists of layers
|
230 |
+
layers = []
|
231 |
+
if i != 0:
|
232 |
+
cur += config.depths[i - 1]
|
233 |
+
for j in range(config.depths[i]):
|
234 |
+
layers.append(
|
235 |
+
SegformerLayer(
|
236 |
+
config,
|
237 |
+
hidden_size=config.hidden_sizes[i],
|
238 |
+
num_attention_heads=config.num_attention_heads[i],
|
239 |
+
sequence_reduction_ratio=config.sr_ratios[i],
|
240 |
+
mlp_ratio=config.mlp_ratios[i],
|
241 |
+
)
|
242 |
+
)
|
243 |
+
blocks.append(nn.ModuleList(layers))
|
244 |
+
|
245 |
+
self.block = nn.ModuleList(blocks)
|
246 |
+
|
247 |
+
# Layer norms
|
248 |
+
self.layer_norm = nn.ModuleList(
|
249 |
+
[nn.LayerNorm(config.hidden_sizes[i]) for i in range(config.num_encoder_blocks)]
|
250 |
+
)
|
251 |
+
|
252 |
+
def forward(
|
253 |
+
self,
|
254 |
+
pixel_values: torch.FloatTensor,
|
255 |
+
output_attentions: Optional[bool] = False,
|
256 |
+
output_hidden_states: Optional[bool] = False,
|
257 |
+
return_dict: Optional[bool] = True,
|
258 |
+
) -> Union[Tuple, BaseModelOutput]:
|
259 |
+
all_hidden_states = () if output_hidden_states else None
|
260 |
+
|
261 |
+
batch_size = pixel_values.shape[0]
|
262 |
+
|
263 |
+
hidden_states = pixel_values
|
264 |
+
for idx, x in enumerate(zip(self.patch_embeddings, self.block, self.layer_norm)):
|
265 |
+
embedding_layer, block_layer, norm_layer = x
|
266 |
+
# first, obtain patch embeddings
|
267 |
+
hidden_states, height, width = embedding_layer(hidden_states)
|
268 |
+
# second, send embeddings through blocks
|
269 |
+
for i, blk in enumerate(block_layer):
|
270 |
+
layer_outputs = blk(hidden_states, height, width, output_attentions)
|
271 |
+
hidden_states = layer_outputs[0]
|
272 |
+
# third, apply layer norm
|
273 |
+
hidden_states = norm_layer(hidden_states)
|
274 |
+
# fourth, optionally reshape back to (batch_size, num_channels, height, width)
|
275 |
+
if idx != len(self.patch_embeddings) - 1 or (
|
276 |
+
idx == len(self.patch_embeddings) - 1 and self.config.reshape_last_stage
|
277 |
+
):
|
278 |
+
hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
|
279 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
280 |
+
|
281 |
+
return all_hidden_states
|
282 |
+
|
283 |
+
class SegformerSelfOutput(nn.Module):
|
284 |
+
def __init__(self, config, hidden_size):
|
285 |
+
super().__init__()
|
286 |
+
self.dense = nn.Linear(hidden_size, hidden_size)
|
287 |
+
|
288 |
+
def forward(self, hidden_states, input_tensor):
|
289 |
+
hidden_states = self.dense(hidden_states)
|
290 |
+
return hidden_states
|
291 |
+
|
292 |
+
|
293 |
+
class SegformerAttention(nn.Module):
|
294 |
+
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio):
|
295 |
+
super().__init__()
|
296 |
+
self.self = SegformerEfficientSelfAttention(
|
297 |
+
config=config,
|
298 |
+
hidden_size=hidden_size,
|
299 |
+
num_attention_heads=num_attention_heads,
|
300 |
+
sequence_reduction_ratio=sequence_reduction_ratio,
|
301 |
+
)
|
302 |
+
self.output = SegformerSelfOutput(config, hidden_size=hidden_size)
|
303 |
+
self.pruned_heads = set()
|
304 |
+
|
305 |
+
def prune_heads(self, heads):
|
306 |
+
if len(heads) == 0:
|
307 |
+
return
|
308 |
+
heads, index = find_pruneable_heads_and_indices(
|
309 |
+
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
|
310 |
+
)
|
311 |
+
|
312 |
+
# Prune linear layers
|
313 |
+
self.self.query = prune_linear_layer(self.self.query, index)
|
314 |
+
self.self.key = prune_linear_layer(self.self.key, index)
|
315 |
+
self.self.value = prune_linear_layer(self.self.value, index)
|
316 |
+
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
|
317 |
+
|
318 |
+
# Update hyper params and store pruned heads
|
319 |
+
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
|
320 |
+
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
|
321 |
+
self.pruned_heads = self.pruned_heads.union(heads)
|
322 |
+
|
323 |
+
def forward(self, hidden_states, height, width, output_attentions=False):
|
324 |
+
self_outputs = self.self(hidden_states, height, width, output_attentions)
|
325 |
+
|
326 |
+
attention_output = self.output(self_outputs[0], hidden_states)
|
327 |
+
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
|
328 |
+
return outputs
|
329 |
+
|
330 |
+
class SegformerDWConv(nn.Module):
|
331 |
+
def __init__(self, dim=768):
|
332 |
+
super().__init__()
|
333 |
+
self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
|
334 |
+
|
335 |
+
def forward(self, hidden_states, height, width):
|
336 |
+
batch_size, seq_len, num_channels = hidden_states.shape
|
337 |
+
hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width)
|
338 |
+
hidden_states = self.dwconv(hidden_states)
|
339 |
+
hidden_states = hidden_states.flatten(2).transpose(1, 2)
|
340 |
+
|
341 |
+
return hidden_states
|
342 |
+
|
343 |
+
|
344 |
+
class SegformerMixFFN(nn.Module):
|
345 |
+
def __init__(self, config, in_features, hidden_features=None, out_features=None):
|
346 |
+
super().__init__()
|
347 |
+
out_features = out_features or in_features
|
348 |
+
self.dense1 = nn.Linear(in_features, hidden_features)
|
349 |
+
self.dwconv = SegformerDWConv(hidden_features)
|
350 |
+
if isinstance(config.hidden_act, str):
|
351 |
+
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
352 |
+
else:
|
353 |
+
self.intermediate_act_fn = config.hidden_act
|
354 |
+
self.dense2 = nn.Linear(hidden_features, out_features)
|
355 |
+
|
356 |
+
def forward(self, hidden_states, height, width):
|
357 |
+
hidden_states = self.dense1(hidden_states)
|
358 |
+
hidden_states = self.dwconv(hidden_states, height, width)
|
359 |
+
hidden_states = self.intermediate_act_fn(hidden_states)
|
360 |
+
hidden_states = self.dense2(hidden_states)
|
361 |
+
return hidden_states
|
362 |
+
|
363 |
+
|
364 |
+
class SegformerLayer(nn.Module):
|
365 |
+
"""This corresponds to the Block class in the original implementation."""
|
366 |
+
|
367 |
+
def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio, mlp_ratio):
|
368 |
+
super().__init__()
|
369 |
+
self.layer_norm_1 = nn.LayerNorm(hidden_size)
|
370 |
+
self.attention = SegformerAttention(
|
371 |
+
config,
|
372 |
+
hidden_size=hidden_size,
|
373 |
+
num_attention_heads=num_attention_heads,
|
374 |
+
sequence_reduction_ratio=sequence_reduction_ratio,
|
375 |
+
)
|
376 |
+
self.layer_norm_2 = nn.LayerNorm(hidden_size)
|
377 |
+
mlp_hidden_size = int(hidden_size * mlp_ratio)
|
378 |
+
self.mlp = SegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size)
|
379 |
+
|
380 |
+
def forward(self, hidden_states, height, width, output_attentions=False):
|
381 |
+
self_attention_outputs = self.attention(
|
382 |
+
self.layer_norm_1(hidden_states), # in Segformer, layernorm is applied before self-attention
|
383 |
+
height,
|
384 |
+
width,
|
385 |
+
output_attentions=output_attentions,
|
386 |
+
)
|
387 |
+
|
388 |
+
attention_output = self_attention_outputs[0]
|
389 |
+
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
|
390 |
+
|
391 |
+
# first residual connection (with stochastic depth)
|
392 |
+
hidden_states = attention_output + hidden_states
|
393 |
+
|
394 |
+
mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
|
395 |
+
|
396 |
+
# second residual connection (with stochastic depth)
|
397 |
+
layer_output = mlp_output + hidden_states
|
398 |
+
|
399 |
+
outputs = (layer_output,) + outputs
|
400 |
+
|
401 |
+
return outputs
|
402 |
+
|
403 |
+
class SegformerModel(SegformerPreTrainedModel):
|
404 |
+
def __init__(self, config):
|
405 |
+
super().__init__(config)
|
406 |
+
self.config = config
|
407 |
+
|
408 |
+
# hierarchical Transformer encoder
|
409 |
+
self.encoder = SegformerEncoder(config)
|
410 |
+
|
411 |
+
# Initialize weights and apply final processing
|
412 |
+
self.post_init()
|
413 |
+
|
414 |
+
def _prune_heads(self, heads_to_prune):
|
415 |
+
"""
|
416 |
+
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
|
417 |
+
class PreTrainedModel
|
418 |
+
"""
|
419 |
+
for layer, heads in heads_to_prune.items():
|
420 |
+
self.encoder.layer[layer].attention.prune_heads(heads)
|
421 |
+
|
422 |
+
def forward(
|
423 |
+
self,
|
424 |
+
pixel_values: torch.FloatTensor,
|
425 |
+
output_attentions: Optional[bool] = None,
|
426 |
+
output_hidden_states: Optional[bool] = None,
|
427 |
+
return_dict: Optional[bool] = None,
|
428 |
+
) -> Union[Tuple, BaseModelOutput]:
|
429 |
+
encoder_outputs = self.encoder(
|
430 |
+
pixel_values,
|
431 |
+
output_attentions=output_attentions,
|
432 |
+
output_hidden_states=output_hidden_states,
|
433 |
+
return_dict=return_dict,
|
434 |
+
)
|
435 |
+
return encoder_outputs
|
436 |
+
|
437 |
+
class SegformerForRegressionMask(SegformerForSemanticSegmentation):
|
438 |
+
def __init__(self, config, **kwargs):
|
439 |
+
super().__init__(config)
|
440 |
+
self.segformer = SegformerModel(config)
|
441 |
+
self.decode_head = SegformerForMaskDecodeHead(config)
|
442 |
+
|
443 |
+
# Initialize weights and apply final processing
|
444 |
+
self.post_init()
|
445 |
+
|
446 |
+
def forward(
|
447 |
+
self,
|
448 |
+
pixel_values: torch.FloatTensor,
|
449 |
+
**kwargs
|
450 |
+
) -> Union[Tuple, SemanticSegmenterOutput]:
|
451 |
+
|
452 |
+
encoder_hidden_states = self.segformer(
|
453 |
+
pixel_values,
|
454 |
+
output_attentions=False,
|
455 |
+
output_hidden_states=True, # we need the intermediate hidden states
|
456 |
+
return_dict=False,
|
457 |
+
)
|
458 |
+
|
459 |
+
logits = self.decode_head(encoder_hidden_states)
|
460 |
+
# Apply sigmoid to get 0-1 output
|
461 |
+
sigmoid_logits = torch.special.expit(logits)
|
462 |
+
|
463 |
+
return SemanticSegmenterOutput(
|
464 |
+
loss=None,
|
465 |
+
logits=sigmoid_logits,
|
466 |
+
hidden_states=None,
|
467 |
+
attentions=None,
|
468 |
+
)
|
surya/model/ordering/config.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import MBartConfig, DonutSwinConfig
|
2 |
+
|
3 |
+
|
4 |
+
class MBartOrderConfig(MBartConfig):
|
5 |
+
pass
|
6 |
+
|
7 |
+
class VariableDonutSwinConfig(DonutSwinConfig):
|
8 |
+
pass
|
surya/model/ordering/decoder.py
ADDED
@@ -0,0 +1,557 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
from typing import Optional, List, Union, Tuple
|
3 |
+
|
4 |
+
from transformers import MBartForCausalLM, MBartConfig
|
5 |
+
from torch import nn
|
6 |
+
from transformers.activations import ACT2FN
|
7 |
+
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_attention_mask
|
8 |
+
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions, BaseModelOutputWithPastAndCrossAttentions
|
9 |
+
from transformers.models.mbart.modeling_mbart import MBartPreTrainedModel, MBartDecoder, MBartLearnedPositionalEmbedding, MBartDecoderLayer
|
10 |
+
from surya.model.ordering.config import MBartOrderConfig
|
11 |
+
import torch
|
12 |
+
import math
|
13 |
+
|
14 |
+
|
15 |
+
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
16 |
+
"""
|
17 |
+
From llama
|
18 |
+
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
|
19 |
+
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
|
20 |
+
"""
|
21 |
+
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
|
22 |
+
if n_rep == 1:
|
23 |
+
return hidden_states
|
24 |
+
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
|
25 |
+
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
|
26 |
+
|
27 |
+
|
28 |
+
class MBartGQAttention(nn.Module):
|
29 |
+
def __init__(
|
30 |
+
self,
|
31 |
+
embed_dim: int,
|
32 |
+
num_heads: int,
|
33 |
+
num_kv_heads: int,
|
34 |
+
dropout: float = 0.0,
|
35 |
+
is_decoder: bool = False,
|
36 |
+
bias: bool = True,
|
37 |
+
is_causal: bool = False,
|
38 |
+
config: Optional[MBartConfig] = None,
|
39 |
+
):
|
40 |
+
super().__init__()
|
41 |
+
self.embed_dim = embed_dim
|
42 |
+
self.num_heads = num_heads
|
43 |
+
self.num_kv_heads = num_kv_heads
|
44 |
+
self.num_kv_groups = self.num_heads // self.num_kv_heads
|
45 |
+
|
46 |
+
assert self.num_heads % self.num_kv_heads == 0, f"num_heads ({self.num_heads}) must be divisible by num_kv_heads ({self.num_kv_heads})"
|
47 |
+
assert embed_dim % self.num_kv_heads == 0, f"embed_dim ({self.embed_dim}) must be divisible by num_kv_heads ({self.num_kv_heads})"
|
48 |
+
|
49 |
+
self.dropout = dropout
|
50 |
+
self.head_dim = embed_dim // num_heads
|
51 |
+
self.config = config
|
52 |
+
|
53 |
+
if (self.head_dim * num_heads) != self.embed_dim:
|
54 |
+
raise ValueError(
|
55 |
+
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
|
56 |
+
f" and `num_heads`: {num_heads})."
|
57 |
+
)
|
58 |
+
self.scaling = self.head_dim**-0.5
|
59 |
+
self.is_decoder = is_decoder
|
60 |
+
self.is_causal = is_causal
|
61 |
+
|
62 |
+
self.k_proj = nn.Linear(embed_dim, self.num_kv_heads * self.head_dim, bias=bias)
|
63 |
+
self.v_proj = nn.Linear(embed_dim, self.num_kv_heads * self.head_dim, bias=bias)
|
64 |
+
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
65 |
+
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
66 |
+
|
67 |
+
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
|
68 |
+
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
|
69 |
+
|
70 |
+
def _shape_key_value(self, tensor: torch.Tensor, seq_len: int, bsz: int):
|
71 |
+
return tensor.view(bsz, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2).contiguous()
|
72 |
+
|
73 |
+
def forward(
|
74 |
+
self,
|
75 |
+
hidden_states: torch.Tensor,
|
76 |
+
key_value_states: Optional[torch.Tensor] = None,
|
77 |
+
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
78 |
+
attention_mask: Optional[torch.Tensor] = None,
|
79 |
+
layer_head_mask: Optional[torch.Tensor] = None,
|
80 |
+
output_attentions: bool = False,
|
81 |
+
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
82 |
+
"""Input shape: Batch x Time x Channel"""
|
83 |
+
|
84 |
+
# if key_value_states are provided this layer is used as a cross-attention layer
|
85 |
+
# for the decoder
|
86 |
+
is_cross_attention = key_value_states is not None
|
87 |
+
|
88 |
+
bsz, tgt_len, _ = hidden_states.size()
|
89 |
+
|
90 |
+
# get query proj
|
91 |
+
query_states = self.q_proj(hidden_states) * self.scaling
|
92 |
+
# get key, value proj
|
93 |
+
# `past_key_value[0].shape[2] == key_value_states.shape[1]`
|
94 |
+
# is checking that the `sequence_length` of the `past_key_value` is the same as
|
95 |
+
# the provided `key_value_states` to support prefix tuning
|
96 |
+
if (
|
97 |
+
is_cross_attention
|
98 |
+
and past_key_value is not None
|
99 |
+
and past_key_value[0].shape[2] == key_value_states.shape[1]
|
100 |
+
):
|
101 |
+
# reuse k,v, cross_attentions
|
102 |
+
key_states = past_key_value[0]
|
103 |
+
value_states = past_key_value[1]
|
104 |
+
elif is_cross_attention:
|
105 |
+
# cross_attentions
|
106 |
+
key_states = self._shape_key_value(self.k_proj(key_value_states), -1, bsz)
|
107 |
+
value_states = self._shape_key_value(self.v_proj(key_value_states), -1, bsz)
|
108 |
+
elif past_key_value is not None:
|
109 |
+
# reuse k, v, self_attention
|
110 |
+
key_states = self._shape_key_value(self.k_proj(hidden_states), -1, bsz)
|
111 |
+
value_states = self._shape_key_value(self.v_proj(hidden_states), -1, bsz)
|
112 |
+
key_states = torch.cat([past_key_value[0], key_states], dim=2)
|
113 |
+
value_states = torch.cat([past_key_value[1], value_states], dim=2)
|
114 |
+
else:
|
115 |
+
# self_attention
|
116 |
+
key_states = self._shape_key_value(self.k_proj(hidden_states), -1, bsz)
|
117 |
+
value_states = self._shape_key_value(self.v_proj(hidden_states), -1, bsz)
|
118 |
+
|
119 |
+
if self.is_decoder:
|
120 |
+
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
|
121 |
+
# Further calls to cross_attention layer can then reuse all cross-attention
|
122 |
+
# key/value_states (first "if" case)
|
123 |
+
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
|
124 |
+
# all previous decoder key/value_states. Further calls to uni-directional self-attention
|
125 |
+
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
|
126 |
+
# if encoder bi-directional self-attention `past_key_value` is always `None`
|
127 |
+
past_key_value = (key_states, value_states)
|
128 |
+
|
129 |
+
proj_shape = (bsz * self.num_heads, -1, self.head_dim)
|
130 |
+
query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
|
131 |
+
|
132 |
+
# Expand kv heads, then match query shape
|
133 |
+
key_states = repeat_kv(key_states, self.num_kv_groups)
|
134 |
+
value_states = repeat_kv(value_states, self.num_kv_groups)
|
135 |
+
key_states = key_states.reshape(*proj_shape)
|
136 |
+
value_states = value_states.reshape(*proj_shape)
|
137 |
+
|
138 |
+
src_len = key_states.size(1)
|
139 |
+
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
|
140 |
+
|
141 |
+
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
|
142 |
+
raise ValueError(
|
143 |
+
f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
|
144 |
+
f" {attn_weights.size()}"
|
145 |
+
)
|
146 |
+
|
147 |
+
if attention_mask is not None:
|
148 |
+
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
149 |
+
raise ValueError(
|
150 |
+
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
|
151 |
+
)
|
152 |
+
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
|
153 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
154 |
+
|
155 |
+
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
|
156 |
+
|
157 |
+
if layer_head_mask is not None:
|
158 |
+
if layer_head_mask.size() != (self.num_heads,):
|
159 |
+
raise ValueError(
|
160 |
+
f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
|
161 |
+
f" {layer_head_mask.size()}"
|
162 |
+
)
|
163 |
+
attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
|
164 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
165 |
+
|
166 |
+
if output_attentions:
|
167 |
+
# this operation is a bit awkward, but it's required to
|
168 |
+
# make sure that attn_weights keeps its gradient.
|
169 |
+
# In order to do so, attn_weights have to be reshaped
|
170 |
+
# twice and have to be reused in the following
|
171 |
+
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
|
172 |
+
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
|
173 |
+
else:
|
174 |
+
attn_weights_reshaped = None
|
175 |
+
|
176 |
+
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
|
177 |
+
|
178 |
+
attn_output = torch.bmm(attn_probs, value_states)
|
179 |
+
|
180 |
+
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
|
181 |
+
raise ValueError(
|
182 |
+
f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
|
183 |
+
f" {attn_output.size()}"
|
184 |
+
)
|
185 |
+
|
186 |
+
attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
|
187 |
+
attn_output = attn_output.transpose(1, 2)
|
188 |
+
|
189 |
+
# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
|
190 |
+
# partitioned across GPUs when using tensor-parallelism.
|
191 |
+
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
|
192 |
+
|
193 |
+
attn_output = self.out_proj(attn_output)
|
194 |
+
|
195 |
+
return attn_output, attn_weights_reshaped, past_key_value
|
196 |
+
|
197 |
+
|
198 |
+
MBART_ATTENTION_CLASSES = {
|
199 |
+
"eager": MBartGQAttention,
|
200 |
+
"flash_attention_2": None
|
201 |
+
}
|
202 |
+
|
203 |
+
|
204 |
+
class MBartOrderDecoderLayer(MBartDecoderLayer):
|
205 |
+
def __init__(self, config: MBartConfig):
|
206 |
+
nn.Module.__init__(self)
|
207 |
+
self.embed_dim = config.d_model
|
208 |
+
|
209 |
+
self.self_attn = MBART_ATTENTION_CLASSES[config._attn_implementation](
|
210 |
+
embed_dim=self.embed_dim,
|
211 |
+
num_heads=config.decoder_attention_heads,
|
212 |
+
num_kv_heads=config.kv_heads,
|
213 |
+
dropout=config.attention_dropout,
|
214 |
+
is_decoder=True,
|
215 |
+
is_causal=True,
|
216 |
+
config=config,
|
217 |
+
)
|
218 |
+
self.dropout = config.dropout
|
219 |
+
self.activation_fn = ACT2FN[config.activation_function]
|
220 |
+
self.activation_dropout = config.activation_dropout
|
221 |
+
|
222 |
+
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
223 |
+
self.encoder_attn = MBART_ATTENTION_CLASSES[config._attn_implementation](
|
224 |
+
self.embed_dim,
|
225 |
+
config.decoder_attention_heads,
|
226 |
+
num_kv_heads=config.kv_heads,
|
227 |
+
dropout=config.attention_dropout,
|
228 |
+
is_decoder=True,
|
229 |
+
config=config,
|
230 |
+
)
|
231 |
+
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
232 |
+
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
|
233 |
+
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
|
234 |
+
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
|
235 |
+
|
236 |
+
|
237 |
+
class BboxEmbedding(nn.Module):
|
238 |
+
def __init__(self, config):
|
239 |
+
super().__init__()
|
240 |
+
self.x1_embed = nn.Embedding(config.max_width, config.d_model)
|
241 |
+
self.y1_embed = nn.Embedding(config.max_height, config.d_model)
|
242 |
+
self.x2_embed = nn.Embedding(config.max_width, config.d_model)
|
243 |
+
self.y2_embed = nn.Embedding(config.max_height, config.d_model)
|
244 |
+
self.w_embed = nn.Embedding(config.max_width, config.d_model)
|
245 |
+
self.h_embed = nn.Embedding(config.max_height, config.d_model)
|
246 |
+
self.cx_embed = nn.Embedding(config.max_width, config.d_model)
|
247 |
+
self.cy_embed = nn.Embedding(config.max_height, config.d_model)
|
248 |
+
self.box_pos_embed = nn.Embedding(config.max_position_embeddings, config.d_model)
|
249 |
+
|
250 |
+
def forward(self, boxes: torch.LongTensor, input_box_counts: torch.LongTensor, past_key_values_length: int):
|
251 |
+
x1, y1, x2, y2 = boxes.unbind(dim=-1)
|
252 |
+
# Shape is (batch_size, num_boxes/seq len, d_model)
|
253 |
+
w = x2 - x1
|
254 |
+
h = y2 - y1
|
255 |
+
# Center x and y in torch long tensors
|
256 |
+
cx = (x1 + x2) / 2
|
257 |
+
cy = (y1 + y2) / 2
|
258 |
+
cx = cx.long()
|
259 |
+
cy = cy.long()
|
260 |
+
|
261 |
+
coord_embeds = self.x1_embed(x1) + self.y1_embed(y1) + self.x2_embed(x2) + self.y2_embed(y2)
|
262 |
+
embedded = coord_embeds + self.w_embed(w) + self.h_embed(h) + self.cx_embed(cx) + self.cy_embed(cy)
|
263 |
+
|
264 |
+
# Add in positional embeddings for the boxes
|
265 |
+
if past_key_values_length == 0:
|
266 |
+
for j in range(embedded.shape[0]):
|
267 |
+
box_start = input_box_counts[j, 0]
|
268 |
+
box_end = input_box_counts[j, 1] - 1 # Skip the sep token
|
269 |
+
box_count = box_end - box_start
|
270 |
+
embedded[j, box_start:box_end] = embedded[j, box_start:box_end] + self.box_pos_embed.weight[:box_count]
|
271 |
+
|
272 |
+
return embedded
|
273 |
+
|
274 |
+
|
275 |
+
class MBartOrderDecoder(MBartDecoder):
|
276 |
+
def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None):
|
277 |
+
MBartPreTrainedModel.__init__(self, config)
|
278 |
+
self.dropout = config.dropout
|
279 |
+
self.layerdrop = config.decoder_layerdrop
|
280 |
+
self.padding_idx = config.pad_token_id
|
281 |
+
self.max_target_positions = config.max_position_embeddings
|
282 |
+
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
|
283 |
+
|
284 |
+
self.embed_tokens = BboxEmbedding(config) if embed_tokens is None else embed_tokens
|
285 |
+
|
286 |
+
if embed_tokens is not None:
|
287 |
+
self.embed_tokens.weight = embed_tokens.weight
|
288 |
+
|
289 |
+
self.embed_positions = MBartLearnedPositionalEmbedding(
|
290 |
+
config.max_position_embeddings,
|
291 |
+
config.d_model,
|
292 |
+
)
|
293 |
+
# Stack of plain decoder layers (no language-specific MoE in the ordering decoder)
|
294 |
+
self.layers = nn.ModuleList([MBartOrderDecoderLayer(config) for _ in range(config.decoder_layers)])
|
295 |
+
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
|
296 |
+
self.layernorm_embedding = nn.LayerNorm(config.d_model)
|
297 |
+
self.layer_norm = nn.LayerNorm(config.d_model)
|
298 |
+
|
299 |
+
self.gradient_checkpointing = False
|
300 |
+
# Initialize weights and apply final processing
|
301 |
+
self.post_init()
|
302 |
+
|
303 |
+
def forward(
|
304 |
+
self,
|
305 |
+
input_boxes: torch.LongTensor = None,
|
306 |
+
input_boxes_mask: Optional[torch.Tensor] = None,
|
307 |
+
input_boxes_counts: Optional[torch.Tensor] = None,
|
308 |
+
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
309 |
+
encoder_attention_mask: Optional[torch.LongTensor] = None,
|
310 |
+
head_mask: Optional[torch.Tensor] = None,
|
311 |
+
cross_attn_head_mask: Optional[torch.Tensor] = None,
|
312 |
+
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
313 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
314 |
+
use_cache: Optional[bool] = None,
|
315 |
+
output_attentions: Optional[bool] = None,
|
316 |
+
output_hidden_states: Optional[bool] = None,
|
317 |
+
return_dict: Optional[bool] = None,
|
318 |
+
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
|
319 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
320 |
+
output_hidden_states = (
|
321 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
322 |
+
)
|
323 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
324 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
325 |
+
|
326 |
+
# retrieve input_ids and inputs_embeds
|
327 |
+
if input_boxes is not None and inputs_embeds is not None:
|
328 |
+
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
|
329 |
+
elif input_boxes is not None:
|
330 |
+
input = input_boxes
|
331 |
+
input_shape = input_boxes.size()[:-1] # Shape (batch_size, num_boxes)
|
332 |
+
elif inputs_embeds is not None:
|
333 |
+
input_shape = inputs_embeds.size()[:-1]
|
334 |
+
input = inputs_embeds[:, :, -1]
|
335 |
+
else:
|
336 |
+
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
|
337 |
+
|
338 |
+
# past_key_values_length
|
339 |
+
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
|
340 |
+
|
341 |
+
if inputs_embeds is None:
|
342 |
+
inputs_embeds = self.embed_tokens(input_boxes, input_boxes_counts, past_key_values_length) * self.embed_scale
|
343 |
+
|
344 |
+
if self._use_flash_attention_2:
|
345 |
+
# 2d mask is passed through the layers
|
346 |
+
attention_mask = input_boxes_mask if (input_boxes_mask is not None and 0 in input_boxes_mask) else None
|
347 |
+
else:
|
348 |
+
# 4d mask is passed through the layers
|
349 |
+
attention_mask = _prepare_4d_causal_attention_mask(
|
350 |
+
input_boxes_mask, input_shape, inputs_embeds, past_key_values_length
|
351 |
+
)
|
352 |
+
|
353 |
+
if past_key_values_length == 0:
|
354 |
+
box_ends = input_boxes_counts[:, 1]
|
355 |
+
box_starts = input_boxes_counts[:, 0]
|
356 |
+
input_shape_arranged = torch.arange(input_shape[1], device=attention_mask.device)[None, :]
|
357 |
+
# Enable all boxes to attend to each other (before the sep token)
|
358 |
+
# Ensure that the boxes are not attending to the padding tokens
|
359 |
+
boxes_end_mask = input_shape_arranged < box_ends[:, None]
|
360 |
+
boxes_start_mask = input_shape_arranged >= box_starts[:, None]
|
361 |
+
boxes_mask = boxes_end_mask & boxes_start_mask
|
362 |
+
boxes_mask = boxes_mask.unsqueeze(1).unsqueeze(1) # Enable proper broadcasting
|
363 |
+
attention_mask = attention_mask.masked_fill(boxes_mask, 0)
|
364 |
+
|
365 |
+
# expand encoder attention mask
|
366 |
+
if encoder_hidden_states is not None and encoder_attention_mask is not None:
|
367 |
+
if self._use_flash_attention_2:
|
368 |
+
encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
|
369 |
+
else:
|
370 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
371 |
+
encoder_attention_mask = _prepare_4d_attention_mask(
|
372 |
+
encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
|
373 |
+
)
|
374 |
+
|
375 |
+
# embed positions
|
376 |
+
positions = self.embed_positions(input, past_key_values_length)
|
377 |
+
|
378 |
+
hidden_states = inputs_embeds + positions.to(inputs_embeds.device)
|
379 |
+
hidden_states = self.layernorm_embedding(hidden_states)
|
380 |
+
|
381 |
+
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
382 |
+
|
383 |
+
if self.gradient_checkpointing and self.training:
|
384 |
+
if use_cache:
|
385 |
+
use_cache = False
|
386 |
+
|
387 |
+
# decoder layers
|
388 |
+
all_hidden_states = () if output_hidden_states else None
|
389 |
+
all_self_attns = () if output_attentions else None
|
390 |
+
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
|
391 |
+
next_decoder_cache = () if use_cache else None
|
392 |
+
|
393 |
+
# check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
|
394 |
+
for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
|
395 |
+
if attn_mask is not None:
|
396 |
+
if attn_mask.size()[0] != len(self.layers):
|
397 |
+
raise ValueError(
|
398 |
+
f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
|
399 |
+
f" {attn_mask.size()[0]}."
|
400 |
+
)
|
401 |
+
for idx, decoder_layer in enumerate(self.layers):
|
402 |
+
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
403 |
+
if output_hidden_states:
|
404 |
+
all_hidden_states += (hidden_states,)
|
405 |
+
if self.training:
|
406 |
+
dropout_probability = torch.rand([])
|
407 |
+
if dropout_probability < self.layerdrop:
|
408 |
+
continue
|
409 |
+
|
410 |
+
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
411 |
+
|
412 |
+
if self.gradient_checkpointing and self.training:
|
413 |
+
layer_outputs = self._gradient_checkpointing_func(
|
414 |
+
decoder_layer.__call__,
|
415 |
+
hidden_states,
|
416 |
+
attention_mask,
|
417 |
+
encoder_hidden_states,
|
418 |
+
encoder_attention_mask,
|
419 |
+
head_mask[idx] if head_mask is not None else None,
|
420 |
+
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
|
421 |
+
None,
|
422 |
+
output_attentions,
|
423 |
+
use_cache,
|
424 |
+
)
|
425 |
+
else:
|
426 |
+
layer_outputs = decoder_layer(
|
427 |
+
hidden_states,
|
428 |
+
attention_mask=attention_mask,
|
429 |
+
encoder_hidden_states=encoder_hidden_states,
|
430 |
+
encoder_attention_mask=encoder_attention_mask,
|
431 |
+
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
|
432 |
+
cross_attn_layer_head_mask=(
|
433 |
+
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
|
434 |
+
),
|
435 |
+
past_key_value=past_key_value,
|
436 |
+
output_attentions=output_attentions,
|
437 |
+
use_cache=use_cache,
|
438 |
+
)
|
439 |
+
hidden_states = layer_outputs[0]
|
440 |
+
|
441 |
+
if use_cache:
|
442 |
+
next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
|
443 |
+
|
444 |
+
if output_attentions:
|
445 |
+
all_self_attns += (layer_outputs[1],)
|
446 |
+
|
447 |
+
if encoder_hidden_states is not None:
|
448 |
+
all_cross_attentions += (layer_outputs[2],)
|
449 |
+
|
450 |
+
hidden_states = self.layer_norm(hidden_states)
|
451 |
+
|
452 |
+
# add hidden states from the last decoder layer
|
453 |
+
if output_hidden_states:
|
454 |
+
all_hidden_states += (hidden_states,)
|
455 |
+
|
456 |
+
next_cache = next_decoder_cache if use_cache else None
|
457 |
+
if not return_dict:
|
458 |
+
return tuple(
|
459 |
+
v
|
460 |
+
for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
|
461 |
+
if v is not None
|
462 |
+
)
|
463 |
+
return BaseModelOutputWithPastAndCrossAttentions(
|
464 |
+
last_hidden_state=hidden_states,
|
465 |
+
past_key_values=next_cache,
|
466 |
+
hidden_states=all_hidden_states,
|
467 |
+
attentions=all_self_attns,
|
468 |
+
cross_attentions=all_cross_attentions,
|
469 |
+
)
|
470 |
+
|
471 |
+
|
472 |
+
class MBartOrderDecoderWrapper(MBartPreTrainedModel):
|
473 |
+
"""
|
474 |
+
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
|
475 |
+
used in combination with the [`EncoderDecoderModel`] framework.
|
476 |
+
"""
|
477 |
+
|
478 |
+
def __init__(self, config):
|
479 |
+
super().__init__(config)
|
480 |
+
self.decoder = MBartOrderDecoder(config)
|
481 |
+
|
482 |
+
def forward(self, *args, **kwargs):
|
483 |
+
return self.decoder(*args, **kwargs)
|
484 |
+
|
485 |
+
|
486 |
+
class MBartOrder(MBartForCausalLM):
|
487 |
+
config_class = MBartOrderConfig
|
488 |
+
_tied_weights_keys = []
|
489 |
+
|
490 |
+
def __init__(self, config, **kwargs):
|
491 |
+
config = copy.deepcopy(config)
|
492 |
+
config.is_decoder = True
|
493 |
+
config.is_encoder_decoder = False
|
494 |
+
MBartPreTrainedModel.__init__(self, config)
|
495 |
+
self.model = MBartOrderDecoderWrapper(config)
|
496 |
+
|
497 |
+
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
498 |
+
|
499 |
+
# Initialize weights and apply final processing
|
500 |
+
self.post_init()
|
501 |
+
|
502 |
+
def forward(
|
503 |
+
self,
|
504 |
+
input_boxes: torch.LongTensor = None,
|
505 |
+
input_boxes_mask: Optional[torch.Tensor] = None,
|
506 |
+
input_boxes_counts: Optional[torch.Tensor] = None,
|
507 |
+
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
508 |
+
encoder_attention_mask: Optional[torch.FloatTensor] = None,
|
509 |
+
head_mask: Optional[torch.Tensor] = None,
|
510 |
+
cross_attn_head_mask: Optional[torch.Tensor] = None,
|
511 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
512 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
513 |
+
labels: Optional[torch.LongTensor] = None,
|
514 |
+
use_cache: Optional[bool] = None,
|
515 |
+
output_attentions: Optional[bool] = None,
|
516 |
+
output_hidden_states: Optional[bool] = None,
|
517 |
+
return_dict: Optional[bool] = None,
|
518 |
+
**kwargs
|
519 |
+
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
|
520 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
521 |
+
output_hidden_states = (
|
522 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
523 |
+
)
|
524 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
525 |
+
|
526 |
+
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
527 |
+
outputs = self.model.decoder(
|
528 |
+
input_boxes=input_boxes,
|
529 |
+
input_boxes_mask=input_boxes_mask,
|
530 |
+
input_boxes_counts=input_boxes_counts,
|
531 |
+
encoder_hidden_states=encoder_hidden_states,
|
532 |
+
encoder_attention_mask=encoder_attention_mask,
|
533 |
+
head_mask=head_mask,
|
534 |
+
cross_attn_head_mask=cross_attn_head_mask,
|
535 |
+
past_key_values=past_key_values,
|
536 |
+
inputs_embeds=inputs_embeds,
|
537 |
+
use_cache=use_cache,
|
538 |
+
output_attentions=output_attentions,
|
539 |
+
output_hidden_states=output_hidden_states,
|
540 |
+
return_dict=return_dict,
|
541 |
+
)
|
542 |
+
|
543 |
+
logits = self.lm_head(outputs[0])
|
544 |
+
|
545 |
+
loss = None
|
546 |
+
if not return_dict:
|
547 |
+
output = (logits,) + outputs[1:]
|
548 |
+
return (loss,) + output if loss is not None else output
|
549 |
+
|
550 |
+
return CausalLMOutputWithCrossAttentions(
|
551 |
+
loss=loss,
|
552 |
+
logits=logits,
|
553 |
+
past_key_values=outputs.past_key_values,
|
554 |
+
hidden_states=outputs.hidden_states,
|
555 |
+
attentions=outputs.attentions,
|
556 |
+
cross_attentions=outputs.cross_attentions,
|
557 |
+
)
|
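For readers tracing the decoder above, the sketch below instantiates `BboxEmbedding` on its own to show the expected input shapes: `boxes` is an integer tensor of shape `(batch, num_boxes, 4)` and `input_box_counts` holds the `[start, end]` indices of the real boxes per sample. The config values are made-up placeholders, not the published checkpoint's settings.

```python
import torch
from types import SimpleNamespace
from surya.model.ordering.decoder import BboxEmbedding

# Hypothetical config values for illustration; the real checkpoint config differs.
cfg = SimpleNamespace(max_width=1300, max_height=1300, d_model=256, max_position_embeddings=512)
embed = BboxEmbedding(cfg)

# Two real boxes followed by a sep "box" (all coords are integer ids < max_width/max_height)
boxes = torch.tensor([[[10, 20, 110, 80], [15, 90, 210, 160], [1281, 1281, 1281, 1281]]])
box_counts = torch.tensor([[0, 3]])  # real boxes start at index 0 and end before index 3 (the sep)

out = embed(boxes, box_counts, past_key_values_length=0)
print(out.shape)  # torch.Size([1, 3, 256])
```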
surya/model/ordering/encoder.py
ADDED
@@ -0,0 +1,83 @@
|
1 |
+
from torch import nn
|
2 |
+
import torch
|
3 |
+
from typing import Optional, Tuple, Union
|
4 |
+
import collections
|
5 |
+
import math
|
6 |
+
|
7 |
+
from transformers import DonutSwinPreTrainedModel
|
8 |
+
from transformers.models.donut.modeling_donut_swin import DonutSwinPatchEmbeddings, DonutSwinEmbeddings, DonutSwinModel, \
|
9 |
+
DonutSwinEncoder
|
10 |
+
|
11 |
+
from surya.model.ordering.config import VariableDonutSwinConfig
|
12 |
+
|
13 |
+
class VariableDonutSwinEmbeddings(DonutSwinEmbeddings):
|
14 |
+
"""
|
15 |
+
Construct the patch and position embeddings. Optionally, also the mask token.
|
16 |
+
"""
|
17 |
+
|
18 |
+
def __init__(self, config, use_mask_token=False, **kwargs):
|
19 |
+
super().__init__(config, use_mask_token)
|
20 |
+
|
21 |
+
self.patch_embeddings = DonutSwinPatchEmbeddings(config)
|
22 |
+
num_patches = self.patch_embeddings.num_patches
|
23 |
+
self.patch_grid = self.patch_embeddings.grid_size
|
24 |
+
self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None
|
25 |
+
self.position_embeddings = None
|
26 |
+
|
27 |
+
if config.use_absolute_embeddings:
|
28 |
+
self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
|
29 |
+
|
30 |
+
self.row_embeddings = None
|
31 |
+
self.column_embeddings = None
|
32 |
+
if config.use_2d_embeddings:
|
33 |
+
self.row_embeddings = nn.Parameter(torch.zeros(1, self.patch_grid[0] + 1, config.embed_dim))
|
34 |
+
self.column_embeddings = nn.Parameter(torch.zeros(1, self.patch_grid[1] + 1, config.embed_dim))
|
35 |
+
|
36 |
+
self.norm = nn.LayerNorm(config.embed_dim)
|
37 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
38 |
+
|
39 |
+
def forward(
|
40 |
+
self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None, **kwargs
|
41 |
+
) -> Tuple[torch.Tensor]:
|
42 |
+
|
43 |
+
embeddings, output_dimensions = self.patch_embeddings(pixel_values)
|
44 |
+
# Layernorm across the last dimension (each patch is a single row)
|
45 |
+
embeddings = self.norm(embeddings)
|
46 |
+
batch_size, seq_len, embed_dim = embeddings.size()
|
47 |
+
|
48 |
+
if bool_masked_pos is not None:
|
49 |
+
mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
|
50 |
+
# replace the masked visual tokens by mask_tokens
|
51 |
+
mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
|
52 |
+
embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
|
53 |
+
|
54 |
+
if self.position_embeddings is not None:
|
55 |
+
embeddings = embeddings + self.position_embeddings[:, :seq_len, :]
|
56 |
+
|
57 |
+
if self.row_embeddings is not None and self.column_embeddings is not None:
|
58 |
+
# Repeat the x position embeddings across the y axis like 0, 1, 2, 3, 0, 1, 2, 3, ...
|
59 |
+
row_embeddings = self.row_embeddings[:, :output_dimensions[0], :].repeat_interleave(output_dimensions[1], dim=1)
|
60 |
+
column_embeddings = self.column_embeddings[:, :output_dimensions[1], :].repeat(1, output_dimensions[0], 1)
|
61 |
+
|
62 |
+
embeddings = embeddings + row_embeddings + column_embeddings
|
63 |
+
|
64 |
+
embeddings = self.dropout(embeddings)
|
65 |
+
|
66 |
+
return embeddings, output_dimensions
|
67 |
+
|
68 |
+
|
69 |
+
class VariableDonutSwinModel(DonutSwinModel):
|
70 |
+
config_class = VariableDonutSwinConfig
|
71 |
+
def __init__(self, config, add_pooling_layer=True, use_mask_token=False, **kwargs):
|
72 |
+
super().__init__(config)
|
73 |
+
self.config = config
|
74 |
+
self.num_layers = len(config.depths)
|
75 |
+
self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))
|
76 |
+
|
77 |
+
self.embeddings = VariableDonutSwinEmbeddings(config, use_mask_token=use_mask_token)
|
78 |
+
self.encoder = DonutSwinEncoder(config, self.embeddings.patch_grid)
|
79 |
+
|
80 |
+
self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
|
81 |
+
|
82 |
+
# Initialize weights and apply final processing
|
83 |
+
self.post_init()
|
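The `VariableDonutSwinEmbeddings` class above builds its 2D positional signal by repeating row embeddings across columns (`repeat_interleave`) and tiling column embeddings across rows (`repeat`). A tiny self-contained sketch with made-up sizes shows why each patch then receives a distinct row+column vector:

```python
import torch

rows, cols, dim = 2, 3, 4  # hypothetical patch grid and embedding size

# Encode the row index in one set of vectors and the column index (x10) in another
row_embeddings = torch.arange(rows).view(1, rows, 1).expand(1, rows, dim).float()
column_embeddings = (torch.arange(cols).view(1, cols, 1).expand(1, cols, dim) * 10).float()

row_part = row_embeddings.repeat_interleave(cols, dim=1)  # rows: 0,0,0,1,1,1
col_part = column_embeddings.repeat(1, rows, 1)           # cols: 0,10,20,0,10,20
print((row_part + col_part)[0, :, 0])  # tensor([ 0., 10., 20.,  1., 11., 21.])
```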
surya/model/ordering/encoderdecoder.py
ADDED
@@ -0,0 +1,90 @@
|
1 |
+
from typing import Optional, Union, Tuple, List
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from transformers import VisionEncoderDecoderModel
|
5 |
+
from transformers.modeling_outputs import Seq2SeqLMOutput, BaseModelOutput
|
6 |
+
|
7 |
+
|
8 |
+
class OrderVisionEncoderDecoderModel(VisionEncoderDecoderModel):
|
9 |
+
def forward(
|
10 |
+
self,
|
11 |
+
pixel_values: Optional[torch.FloatTensor] = None,
|
12 |
+
decoder_input_boxes: torch.LongTensor = None,
|
13 |
+
# Shape (batch_size, num_boxes, 4), all coords scaled 0 - 1000, with 1001 as padding
|
14 |
+
decoder_input_boxes_mask: torch.LongTensor = None, # Shape (batch_size, num_boxes), 0 if padding, 1 otherwise
|
15 |
+
decoder_input_boxes_counts: torch.LongTensor = None, # Shape (batch_size), number of boxes in each image
|
16 |
+
encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
|
17 |
+
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
18 |
+
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
|
19 |
+
labels: Optional[List[List[int]]] = None,
|
20 |
+
use_cache: Optional[bool] = None,
|
21 |
+
output_attentions: Optional[bool] = None,
|
22 |
+
output_hidden_states: Optional[bool] = None,
|
23 |
+
return_dict: Optional[bool] = None,
|
24 |
+
**kwargs,
|
25 |
+
) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
|
26 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
27 |
+
|
28 |
+
kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
|
29 |
+
|
30 |
+
kwargs_decoder = {
|
31 |
+
argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
|
32 |
+
}
|
33 |
+
|
34 |
+
if encoder_outputs is None:
|
35 |
+
if pixel_values is None:
|
36 |
+
raise ValueError("You have to specify pixel_values")
|
37 |
+
|
38 |
+
encoder_outputs = self.encoder(
|
39 |
+
pixel_values=pixel_values,
|
40 |
+
output_attentions=output_attentions,
|
41 |
+
output_hidden_states=output_hidden_states,
|
42 |
+
return_dict=return_dict,
|
43 |
+
**kwargs_encoder,
|
44 |
+
)
|
45 |
+
elif isinstance(encoder_outputs, tuple):
|
46 |
+
encoder_outputs = BaseModelOutput(*encoder_outputs)
|
47 |
+
|
48 |
+
encoder_hidden_states = encoder_outputs[0]
|
49 |
+
|
50 |
+
# optionally project encoder_hidden_states
|
51 |
+
if (
|
52 |
+
self.encoder.config.hidden_size != self.decoder.config.hidden_size
|
53 |
+
and self.decoder.config.cross_attention_hidden_size is None
|
54 |
+
):
|
55 |
+
encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
|
56 |
+
|
57 |
+
# else:
|
58 |
+
encoder_attention_mask = None
|
59 |
+
|
60 |
+
# Decode
|
61 |
+
decoder_outputs = self.decoder(
|
62 |
+
input_boxes=decoder_input_boxes,
|
63 |
+
input_boxes_mask=decoder_input_boxes_mask,
|
64 |
+
input_boxes_counts=decoder_input_boxes_counts,
|
65 |
+
encoder_hidden_states=encoder_hidden_states,
|
66 |
+
encoder_attention_mask=encoder_attention_mask,
|
67 |
+
inputs_embeds=decoder_inputs_embeds,
|
68 |
+
output_attentions=output_attentions,
|
69 |
+
output_hidden_states=output_hidden_states,
|
70 |
+
use_cache=use_cache,
|
71 |
+
past_key_values=past_key_values,
|
72 |
+
return_dict=return_dict,
|
73 |
+
labels=labels,
|
74 |
+
**kwargs_decoder,
|
75 |
+
)
|
76 |
+
|
77 |
+
if not return_dict:
|
78 |
+
return decoder_outputs + encoder_outputs
|
79 |
+
|
80 |
+
return Seq2SeqLMOutput(
|
81 |
+
loss=decoder_outputs.loss,
|
82 |
+
logits=decoder_outputs.logits,
|
83 |
+
past_key_values=decoder_outputs.past_key_values,
|
84 |
+
decoder_hidden_states=decoder_outputs.hidden_states,
|
85 |
+
decoder_attentions=decoder_outputs.attentions,
|
86 |
+
cross_attentions=decoder_outputs.cross_attentions,
|
87 |
+
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
|
88 |
+
encoder_hidden_states=encoder_outputs.hidden_states,
|
89 |
+
encoder_attentions=encoder_outputs.attentions,
|
90 |
+
)
|
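One detail worth noting in the forward pass above is the `decoder_` prefix convention inherited from `VisionEncoderDecoderModel`: any extra keyword argument that starts with `decoder_` is stripped of the prefix and routed to the decoder, while everything else goes to the encoder. A standalone illustration with dummy kwargs:

```python
# Dummy kwargs purely to illustrate the prefix-splitting logic used above.
kwargs = {"output_attentions": True, "decoder_use_cache": False}

kwargs_encoder = {k: v for k, v in kwargs.items() if not k.startswith("decoder_")}
kwargs_decoder = {k[len("decoder_"):]: v for k, v in kwargs.items() if k.startswith("decoder_")}

print(kwargs_encoder)  # {'output_attentions': True}
print(kwargs_decoder)  # {'use_cache': False}
```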
surya/model/ordering/model.py
ADDED
@@ -0,0 +1,34 @@
|
1 |
+
from transformers import DetrConfig, BeitConfig, DetrImageProcessor, VisionEncoderDecoderConfig, AutoModelForCausalLM, \
|
2 |
+
AutoModel
|
3 |
+
from surya.model.ordering.config import MBartOrderConfig, VariableDonutSwinConfig
|
4 |
+
from surya.model.ordering.decoder import MBartOrder
|
5 |
+
from surya.model.ordering.encoder import VariableDonutSwinModel
|
6 |
+
from surya.model.ordering.encoderdecoder import OrderVisionEncoderDecoderModel
|
7 |
+
from surya.model.ordering.processor import OrderImageProcessor
|
8 |
+
from surya.settings import settings
|
9 |
+
|
10 |
+
|
11 |
+
def load_model(checkpoint=settings.ORDER_MODEL_CHECKPOINT, device=settings.TORCH_DEVICE_MODEL, dtype=settings.MODEL_DTYPE):
|
12 |
+
config = VisionEncoderDecoderConfig.from_pretrained(checkpoint)
|
13 |
+
|
14 |
+
decoder_config = vars(config.decoder)
|
15 |
+
decoder = MBartOrderConfig(**decoder_config)
|
16 |
+
config.decoder = decoder
|
17 |
+
|
18 |
+
encoder_config = vars(config.encoder)
|
19 |
+
encoder = VariableDonutSwinConfig(**encoder_config)
|
20 |
+
config.encoder = encoder
|
21 |
+
|
22 |
+
# Get transformers to load custom model
|
23 |
+
AutoModel.register(MBartOrderConfig, MBartOrder)
|
24 |
+
AutoModelForCausalLM.register(MBartOrderConfig, MBartOrder)
|
25 |
+
AutoModel.register(VariableDonutSwinConfig, VariableDonutSwinModel)
|
26 |
+
|
27 |
+
model = OrderVisionEncoderDecoderModel.from_pretrained(checkpoint, config=config, torch_dtype=dtype)
|
28 |
+
assert isinstance(model.decoder, MBartOrder)
|
29 |
+
assert isinstance(model.encoder, VariableDonutSwinModel)
|
30 |
+
|
31 |
+
model = model.to(device)
|
32 |
+
model = model.eval()
|
33 |
+
print(f"Loaded reading order model {checkpoint} on device {device} with dtype {dtype}")
|
34 |
+
return model
|
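A hedged usage sketch for the loader above: it assumes the pinned surya commit is installed and that the ordering checkpoint referenced by `settings.ORDER_MODEL_CHECKPOINT` can be downloaded; all defaults come from `surya.settings`.

```python
from surya.model.ordering.model import load_model
from surya.model.ordering.processor import load_processor

order_model = load_model()          # uses settings.ORDER_MODEL_CHECKPOINT by default
order_processor = load_processor()  # defined in processor.py below
print(type(order_model).__name__)   # OrderVisionEncoderDecoderModel
```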
surya/model/ordering/processor.py
ADDED
@@ -0,0 +1,156 @@
|
1 |
+
from copy import deepcopy
|
2 |
+
from typing import Dict, Union, Optional, List, Tuple
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from torch import TensorType
|
6 |
+
from transformers import DonutImageProcessor, DonutProcessor
|
7 |
+
from transformers.image_processing_utils import BatchFeature
|
8 |
+
from transformers.image_utils import PILImageResampling, ImageInput, ChannelDimension, make_list_of_images, \
|
9 |
+
valid_images, to_numpy_array
|
10 |
+
import numpy as np
|
11 |
+
from PIL import Image
|
12 |
+
import PIL
|
13 |
+
from surya.settings import settings
|
14 |
+
|
15 |
+
|
16 |
+
def load_processor(checkpoint=settings.ORDER_MODEL_CHECKPOINT):
|
17 |
+
processor = OrderImageProcessor.from_pretrained(checkpoint)
|
18 |
+
processor.size = settings.ORDER_IMAGE_SIZE
|
19 |
+
box_size = 1024
|
20 |
+
max_tokens = 256
|
21 |
+
processor.token_sep_id = max_tokens + box_size + 1
|
22 |
+
processor.token_pad_id = max_tokens + box_size + 2
|
23 |
+
processor.max_boxes = settings.ORDER_MAX_BOXES - 1
|
24 |
+
processor.box_size = {"height": box_size, "width": box_size}
|
25 |
+
return processor
|
26 |
+
|
27 |
+
|
28 |
+
class OrderImageProcessor(DonutImageProcessor):
|
29 |
+
def __init__(self, *args, **kwargs):
|
30 |
+
super().__init__(*args, **kwargs)
|
31 |
+
|
32 |
+
self.patch_size = kwargs.get("patch_size", (4, 4))
|
33 |
+
|
34 |
+
def process_inner(self, images: List[np.ndarray]):
|
35 |
+
images = [img.transpose(2, 0, 1) for img in images] # convert to CHW format
|
36 |
+
|
37 |
+
assert images[0].shape[0] == 3 # RGB input images, channel dim first after the transpose above
|
38 |
+
|
39 |
+
# Convert to float32 for rescale/normalize
|
40 |
+
images = [img.astype(np.float32) for img in images]
|
41 |
+
|
42 |
+
# Rescale and normalize
|
43 |
+
images = [
|
44 |
+
self.rescale(img, scale=self.rescale_factor, input_data_format=ChannelDimension.FIRST)
|
45 |
+
for img in images
|
46 |
+
]
|
47 |
+
images = [
|
48 |
+
self.normalize(img, mean=self.image_mean, std=self.image_std, input_data_format=ChannelDimension.FIRST)
|
49 |
+
for img in images
|
50 |
+
]
|
51 |
+
|
52 |
+
return images
|
53 |
+
|
54 |
+
def process_boxes(self, boxes):
|
55 |
+
padded_boxes = []
|
56 |
+
box_masks = []
|
57 |
+
box_counts = []
|
58 |
+
for b in boxes:
|
59 |
+
# Left pad for generation
|
60 |
+
padded_b = deepcopy(b)
|
61 |
+
padded_b.append([self.token_sep_id] * 4) # Sep token to indicate start of label predictions
|
62 |
+
padded_boxes.append(padded_b)
|
63 |
+
|
64 |
+
max_boxes = max(len(b) for b in padded_boxes)
|
65 |
+
for i in range(len(padded_boxes)):
|
66 |
+
pad_len = max_boxes - len(padded_boxes[i])
|
67 |
+
box_len = len(padded_boxes[i])
|
68 |
+
box_mask = [0] * pad_len + [1] * box_len
|
69 |
+
padded_box = [[self.token_pad_id] * 4] * pad_len + padded_boxes[i]
|
70 |
+
padded_boxes[i] = padded_box
|
71 |
+
box_masks.append(box_mask)
|
72 |
+
box_counts.append([pad_len, max_boxes])
|
73 |
+
|
74 |
+
return padded_boxes, box_masks, box_counts
|
75 |
+
|
76 |
+
def resize_img_and_boxes(self, img, boxes):
|
77 |
+
orig_dim = img.size
|
78 |
+
new_size = (self.size["width"], self.size["height"])
|
79 |
+
img.thumbnail(new_size, Image.Resampling.LANCZOS) # Shrink largest dimension to fit new size
|
80 |
+
img = img.resize(new_size, Image.Resampling.LANCZOS) # Stretch smaller dimension to fit new size
|
81 |
+
|
82 |
+
img = np.asarray(img, dtype=np.uint8)
|
83 |
+
|
84 |
+
width, height = orig_dim
|
85 |
+
box_width, box_height = self.box_size["width"], self.box_size["height"]
|
86 |
+
for box in boxes:
|
87 |
+
# Rescale to 0-1024
|
88 |
+
box[0] = box[0] / width * box_width
|
89 |
+
box[1] = box[1] / height * box_height
|
90 |
+
box[2] = box[2] / width * box_width
|
91 |
+
box[3] = box[3] / height * box_height
|
92 |
+
|
93 |
+
if box[0] < 0:
|
94 |
+
box[0] = 0
|
95 |
+
if box[1] < 0:
|
96 |
+
box[1] = 0
|
97 |
+
if box[2] > box_width:
|
98 |
+
box[2] = box_width
|
99 |
+
if box[3] > box_height:
|
100 |
+
box[3] = box_height
|
101 |
+
|
102 |
+
return img, boxes
|
103 |
+
|
104 |
+
def preprocess(
|
105 |
+
self,
|
106 |
+
images: ImageInput,
|
107 |
+
boxes: List[List[int]],
|
108 |
+
do_resize: bool = None,
|
109 |
+
size: Dict[str, int] = None,
|
110 |
+
resample: PILImageResampling = None,
|
111 |
+
do_thumbnail: bool = None,
|
112 |
+
do_align_long_axis: bool = None,
|
113 |
+
do_pad: bool = None,
|
114 |
+
random_padding: bool = False,
|
115 |
+
do_rescale: bool = None,
|
116 |
+
rescale_factor: float = None,
|
117 |
+
do_normalize: bool = None,
|
118 |
+
image_mean: Optional[Union[float, List[float]]] = None,
|
119 |
+
image_std: Optional[Union[float, List[float]]] = None,
|
120 |
+
return_tensors: Optional[Union[str, TensorType]] = None,
|
121 |
+
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
122 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
123 |
+
**kwargs,
|
124 |
+
) -> PIL.Image.Image:
|
125 |
+
images = make_list_of_images(images)
|
126 |
+
|
127 |
+
if not valid_images(images):
|
128 |
+
raise ValueError(
|
129 |
+
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
|
130 |
+
"torch.Tensor, tf.Tensor or jax.ndarray."
|
131 |
+
)
|
132 |
+
|
133 |
+
new_images = []
|
134 |
+
new_boxes = []
|
135 |
+
for img, box in zip(images, boxes):
|
136 |
+
if len(box) > self.max_boxes:
|
137 |
+
raise ValueError(f"Too many boxes, max is {self.max_boxes}")
|
138 |
+
img, box = self.resize_img_and_boxes(img, box)
|
139 |
+
new_images.append(img)
|
140 |
+
new_boxes.append(box)
|
141 |
+
|
142 |
+
images = new_images
|
143 |
+
boxes = new_boxes
|
144 |
+
|
145 |
+
# Convert to numpy for later processing steps
|
146 |
+
images = [np.array(image) for image in images]
|
147 |
+
|
148 |
+
images = self.process_inner(images)
|
149 |
+
boxes, box_mask, box_counts = self.process_boxes(boxes)
|
150 |
+
data = {
|
151 |
+
"pixel_values": images,
|
152 |
+
"input_boxes": boxes,
|
153 |
+
"input_boxes_mask": box_mask,
|
154 |
+
"input_boxes_counts": box_counts,
|
155 |
+
}
|
156 |
+
return BatchFeature(data=data, tensor_type=return_tensors)
|
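The left-padding performed by `process_boxes` above is easy to miss, so here is a standalone re-implementation sketch with two toy box lists. The sep/pad ids match what `load_processor` derives (256 + 1024 + 1 and + 2); the boxes themselves are made up.

```python
from copy import deepcopy

token_sep_id, token_pad_id = 1281, 1282  # as computed in load_processor above
boxes = [[[10, 20, 110, 80]],
         [[5, 5, 50, 50], [60, 5, 120, 50]]]

# Append a sep "box" to each sample, then left-pad the shorter samples
padded = [deepcopy(b) + [[token_sep_id] * 4] for b in boxes]
max_boxes = max(len(b) for b in padded)

masks, counts = [], []
for i, b in enumerate(padded):
    pad_len = max_boxes - len(b)
    masks.append([0] * pad_len + [1] * len(b))
    counts.append([pad_len, max_boxes])
    padded[i] = [[token_pad_id] * 4] * pad_len + b

print(padded[0][0], masks[0], counts[0])
# [1282, 1282, 1282, 1282] [0, 1, 1] [1, 3]
```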
surya/model/recognition/__pycache__/config.cpython-310.pyc
ADDED
Binary file (2.41 kB).
|
|
surya/model/recognition/__pycache__/decoder.cpython-310.pyc
ADDED
Binary file (14.2 kB).
|
|
surya/model/recognition/__pycache__/encoder.cpython-310.pyc
ADDED
Binary file (13.2 kB).
|
|
surya/model/recognition/__pycache__/model.cpython-310.pyc
ADDED
Binary file (2.56 kB).
|
|
surya/model/recognition/__pycache__/processor.cpython-310.pyc
ADDED
Binary file (7.14 kB).
|
|
surya/model/recognition/__pycache__/tokenizer.cpython-310.pyc
ADDED
Binary file (3.5 kB).
|
|
surya/model/recognition/config.py
ADDED
@@ -0,0 +1,111 @@
|
1 |
+
from transformers import T5Config, MBartConfig, DonutSwinConfig
|
2 |
+
|
3 |
+
|
4 |
+
class MBartMoEConfig(MBartConfig):
|
5 |
+
pass
|
6 |
+
|
7 |
+
|
8 |
+
class VariableDonutSwinConfig(DonutSwinConfig):
|
9 |
+
pass
|
10 |
+
|
11 |
+
|
12 |
+
# Config specific to the model, needed for the tokenizer
|
13 |
+
TOTAL_TOKENS = 65536
|
14 |
+
TOKEN_OFFSET = 3 # Pad, eos, bos
|
15 |
+
SPECIAL_TOKENS = 253
|
16 |
+
TOTAL_VOCAB_SIZE = TOTAL_TOKENS + TOKEN_OFFSET + SPECIAL_TOKENS
|
17 |
+
LANGUAGE_MAP = {
|
18 |
+
'af': 0,
|
19 |
+
'am': 1,
|
20 |
+
'ar': 2,
|
21 |
+
'as': 3,
|
22 |
+
'az': 4,
|
23 |
+
'be': 5,
|
24 |
+
'bg': 6,
|
25 |
+
'bn': 7,
|
26 |
+
'br': 8,
|
27 |
+
'bs': 9,
|
28 |
+
'ca': 10,
|
29 |
+
'cs': 11,
|
30 |
+
'cy': 12,
|
31 |
+
'da': 13,
|
32 |
+
'de': 14,
|
33 |
+
'el': 15,
|
34 |
+
'en': 16,
|
35 |
+
'eo': 17,
|
36 |
+
'es': 18,
|
37 |
+
'et': 19,
|
38 |
+
'eu': 20,
|
39 |
+
'fa': 21,
|
40 |
+
'fi': 22,
|
41 |
+
'fr': 23,
|
42 |
+
'fy': 24,
|
43 |
+
'ga': 25,
|
44 |
+
'gd': 26,
|
45 |
+
'gl': 27,
|
46 |
+
'gu': 28,
|
47 |
+
'ha': 29,
|
48 |
+
'he': 30,
|
49 |
+
'hi': 31,
|
50 |
+
'hr': 32,
|
51 |
+
'hu': 33,
|
52 |
+
'hy': 34,
|
53 |
+
'id': 35,
|
54 |
+
'is': 36,
|
55 |
+
'it': 37,
|
56 |
+
'ja': 38,
|
57 |
+
'jv': 39,
|
58 |
+
'ka': 40,
|
59 |
+
'kk': 41,
|
60 |
+
'km': 42,
|
61 |
+
'kn': 43,
|
62 |
+
'ko': 44,
|
63 |
+
'ku': 45,
|
64 |
+
'ky': 46,
|
65 |
+
'la': 47,
|
66 |
+
'lo': 48,
|
67 |
+
'lt': 49,
|
68 |
+
'lv': 50,
|
69 |
+
'mg': 51,
|
70 |
+
'mk': 52,
|
71 |
+
'ml': 53,
|
72 |
+
'mn': 54,
|
73 |
+
'mr': 55,
|
74 |
+
'ms': 56,
|
75 |
+
'my': 57,
|
76 |
+
'ne': 58,
|
77 |
+
'nl': 59,
|
78 |
+
'no': 60,
|
79 |
+
'om': 61,
|
80 |
+
'or': 62,
|
81 |
+
'pa': 63,
|
82 |
+
'pl': 64,
|
83 |
+
'ps': 65,
|
84 |
+
'pt': 66,
|
85 |
+
'ro': 67,
|
86 |
+
'ru': 68,
|
87 |
+
'sa': 69,
|
88 |
+
'sd': 70,
|
89 |
+
'si': 71,
|
90 |
+
'sk': 72,
|
91 |
+
'sl': 73,
|
92 |
+
'so': 74,
|
93 |
+
'sq': 75,
|
94 |
+
'sr': 76,
|
95 |
+
'su': 77,
|
96 |
+
'sv': 78,
|
97 |
+
'sw': 79,
|
98 |
+
'ta': 80,
|
99 |
+
'te': 81,
|
100 |
+
'th': 82,
|
101 |
+
'tl': 83,
|
102 |
+
'tr': 84,
|
103 |
+
'ug': 85,
|
104 |
+
'uk': 86,
|
105 |
+
'ur': 87,
|
106 |
+
'uz': 88,
|
107 |
+
'vi': 89,
|
108 |
+
'xh': 90,
|
109 |
+
'yi': 91,
|
110 |
+
'zh': 92
|
111 |
+
}
|
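A quick sanity-check sketch for the constants above: the language map covers 93 codes with contiguous indices, and the declared vocabulary size is the sum of the three token-count constants. How these indices become actual token ids is handled by the tokenizer, which is not shown in this file.

```python
from surya.model.recognition.config import (
    LANGUAGE_MAP, TOTAL_TOKENS, TOKEN_OFFSET, SPECIAL_TOKENS, TOTAL_VOCAB_SIZE
)

assert sorted(LANGUAGE_MAP.values()) == list(range(len(LANGUAGE_MAP)))
assert TOTAL_VOCAB_SIZE == TOTAL_TOKENS + TOKEN_OFFSET + SPECIAL_TOKENS
print(len(LANGUAGE_MAP), TOTAL_VOCAB_SIZE)  # 93 65792
```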
surya/model/recognition/decoder.py
ADDED
@@ -0,0 +1,511 @@
|
1 |
+
import copy
|
2 |
+
from typing import Optional, List, Union, Tuple
|
3 |
+
|
4 |
+
from transformers import MBartForCausalLM, MBartConfig
|
5 |
+
from torch import nn
|
6 |
+
from transformers.activations import ACT2FN
|
7 |
+
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions, BaseModelOutputWithPastAndCrossAttentions
|
8 |
+
from transformers.models.mbart.modeling_mbart import MBartPreTrainedModel, MBartDecoder
|
9 |
+
from .config import MBartMoEConfig
|
10 |
+
import torch
|
11 |
+
import math
|
12 |
+
|
13 |
+
|
14 |
+
class MBartLearnedPositionalEmbedding(nn.Embedding):
|
15 |
+
"""
|
16 |
+
This module learns positional embeddings up to a fixed maximum size.
|
17 |
+
"""
|
18 |
+
|
19 |
+
def __init__(self, num_embeddings: int, embedding_dim: int):
|
20 |
+
# MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
|
21 |
+
# and adjust num_embeddings appropriately. Other models don't have this hack
|
22 |
+
self.offset = 2
|
23 |
+
super().__init__(num_embeddings + self.offset, embedding_dim)
|
24 |
+
|
25 |
+
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
|
26 |
+
"""`input_ids' shape is expected to be [bsz x seqlen]."""
|
27 |
+
|
28 |
+
bsz, seq_len = input_ids.shape[:2]
|
29 |
+
positions = torch.arange(
|
30 |
+
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
|
31 |
+
).expand(bsz, -1)
|
32 |
+
|
33 |
+
return super().forward(positions + self.offset)
|
34 |
+
|
35 |
+
|
36 |
+
class MBartExpertMLP(nn.Module):
|
37 |
+
def __init__(self, config: MBartConfig, is_lg=False, is_xl=False):
|
38 |
+
super().__init__()
|
39 |
+
self.ffn_dim = config.d_expert
|
40 |
+
if is_lg:
|
41 |
+
self.ffn_dim = config.d_expert_lg
|
42 |
+
if is_xl:
|
43 |
+
self.ffn_dim = config.d_expert_xl
|
44 |
+
self.hidden_dim = config.d_model
|
45 |
+
|
46 |
+
self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
|
47 |
+
self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
|
48 |
+
self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
|
49 |
+
self.dropout = nn.Dropout(config.activation_dropout)
|
50 |
+
|
51 |
+
self.act_fn = ACT2FN[config.activation_function]
|
52 |
+
|
53 |
+
def forward(self, hidden_states):
|
54 |
+
current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
|
55 |
+
current_hidden_states = self.w2(current_hidden_states)
|
56 |
+
return current_hidden_states
|
57 |
+
|
58 |
+
|
59 |
+
class MBartExpertLayer(nn.Module):
|
60 |
+
# From mixtral, with modifications
|
61 |
+
def __init__(self, config):
|
62 |
+
super().__init__()
|
63 |
+
self.dropout = nn.Dropout(config.activation_dropout)
|
64 |
+
|
65 |
+
self.hidden_dim = config.d_model
|
66 |
+
|
67 |
+
self.lg_lang_codes = sorted(config.lg_langs.values()) if hasattr(config, "lg_langs") else []
|
68 |
+
self.xl_lang_codes = sorted(config.xl_langs.values()) if hasattr(config, "xl_langs") else []
|
69 |
+
|
70 |
+
self.lang_codes = sorted(config.langs.values())
|
71 |
+
self.num_experts = len(self.lang_codes)
|
72 |
+
|
73 |
+
self.experts = nn.ModuleDict({str(lang): MBartExpertMLP(config, is_lg=(lang in self.lg_lang_codes), is_xl=(lang in self.xl_lang_codes)) for lang in self.lang_codes})
|
74 |
+
|
75 |
+
def forward(self, hidden_states: torch.Tensor, langs: torch.LongTensor) -> torch.Tensor:
|
76 |
+
batch_size, sequence_length, hidden_dim = hidden_states.shape
|
77 |
+
|
78 |
+
final_hidden_states = torch.zeros(
|
79 |
+
(batch_size, sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
|
80 |
+
)
|
81 |
+
|
82 |
+
# Weight experts based on how many languages in the input
|
83 |
+
routing_weights = 1 / ((langs > 3).sum(axis=-1))
|
84 |
+
# Set weights to 1 if zero experts activated
|
85 |
+
routing_weights[torch.isinf(routing_weights)] = 1
|
86 |
+
|
87 |
+
unique_langs = langs.unique(dim=None, sorted=True)
|
88 |
+
unique_langs = unique_langs[unique_langs > 3] # Remove start token
|
89 |
+
|
90 |
+
# Loop over all available experts in the model and perform the computation on each expert
|
91 |
+
for expert_lang in unique_langs:
|
92 |
+
# Check which samples match with this expert
|
93 |
+
lang_match = (langs == expert_lang).any(dim=-1)
|
94 |
+
idx = torch.nonzero(lang_match, as_tuple=True)[0]
|
95 |
+
|
96 |
+
if idx.shape[0] == 0:
|
97 |
+
continue
|
98 |
+
|
99 |
+
expert_layer = self.experts[str(expert_lang.item())]
|
100 |
+
|
101 |
+
current_state = hidden_states[idx]
|
102 |
+
current_hidden_states = expert_layer(current_state.view(-1, hidden_dim))
|
103 |
+
current_hidden_states = current_hidden_states.view(-1, sequence_length, hidden_dim)
|
104 |
+
|
105 |
+
# Weight by number of languages in the input
|
106 |
+
selected_routing_weights = routing_weights[idx].view(-1, 1, 1)
|
107 |
+
current_hidden_states *= selected_routing_weights
|
108 |
+
|
109 |
+
final_hidden_states.index_add_(0, idx, current_hidden_states)
|
110 |
+
|
111 |
+
return final_hidden_states
|
112 |
+
|
113 |
+
|
114 |
+
class MBartGQAttention(nn.Module):
|
115 |
+
def __init__(
|
116 |
+
self,
|
117 |
+
embed_dim: int,
|
118 |
+
num_heads: int,
|
119 |
+
num_kv_heads: int,
|
120 |
+
dropout: float = 0.0,
|
121 |
+
is_decoder: bool = False,
|
122 |
+
bias: bool = True,
|
123 |
+
is_causal: bool = False,
|
124 |
+
config: Optional[MBartConfig] = None,
|
125 |
+
):
|
126 |
+
super().__init__()
|
127 |
+
self.embed_dim = embed_dim
|
128 |
+
self.num_heads = num_heads
|
129 |
+
self.num_kv_heads = num_kv_heads
|
130 |
+
self.num_kv_groups = self.num_heads // self.num_kv_heads
|
131 |
+
|
132 |
+
self.dropout = dropout
|
133 |
+
self.head_dim = embed_dim // num_heads
|
134 |
+
self.config = config
|
135 |
+
self.scaling = self.head_dim**-0.5
|
136 |
+
self.is_decoder = is_decoder
|
137 |
+
self.is_causal = is_causal
|
138 |
+
|
139 |
+
self.k_proj = nn.Linear(embed_dim, self.num_kv_heads * self.head_dim, bias=bias)
|
140 |
+
self.v_proj = nn.Linear(embed_dim, self.num_kv_heads * self.head_dim, bias=bias)
|
141 |
+
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
142 |
+
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
143 |
+
|
144 |
+
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
|
145 |
+
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
|
146 |
+
|
147 |
+
def _shape_key_value(self, tensor: torch.Tensor, seq_len: int, bsz: int):
|
148 |
+
return tensor.view(bsz, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2).contiguous()
|
149 |
+
|
150 |
+
def forward(
|
151 |
+
self,
|
152 |
+
hidden_states: torch.Tensor,
|
153 |
+
key_value_states: Optional[torch.Tensor] = None,
|
154 |
+
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
155 |
+
is_prefill: Optional[bool] = False,
|
156 |
+
attention_mask: Optional[torch.Tensor] = None,
|
157 |
+
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
158 |
+
"""Input shape: Batch x Time x Channel"""
|
159 |
+
|
160 |
+
# if key_value_states are provided this layer is used as a cross-attention layer
|
161 |
+
# for the decoder
|
162 |
+
is_cross_attention = key_value_states is not None
|
163 |
+
|
164 |
+
bsz, tgt_len, _ = hidden_states.size()
|
165 |
+
|
166 |
+
# get query proj
|
167 |
+
query_states = self.q_proj(hidden_states) * self.scaling
|
168 |
+
# get key, value proj
|
169 |
+
# `past_key_value[0].shape[2] == key_value_states.shape[1]`
|
170 |
+
# is checking that the `sequence_length` of the `past_key_value` is the same as
|
171 |
+
# the provided `key_value_states` to support prefix tuning
|
172 |
+
if is_cross_attention:
|
173 |
+
if is_prefill:
|
174 |
+
# cross_attentions
|
175 |
+
key_states = self._shape_key_value(self.k_proj(key_value_states), -1, bsz)
|
176 |
+
value_states = self._shape_key_value(self.v_proj(key_value_states), -1, bsz)
|
177 |
+
past_key_value = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0)
|
178 |
+
else:
|
179 |
+
# reuse k,v, cross_attentions
|
180 |
+
key_states = past_key_value[0]
|
181 |
+
value_states = past_key_value[1]
|
182 |
+
past_key_value = None
|
183 |
+
# Self-attention
|
184 |
+
else:
|
185 |
+
if is_prefill:
|
186 |
+
# initial prompt
|
187 |
+
key_states = self._shape_key_value(self.k_proj(hidden_states), -1, bsz)
|
188 |
+
value_states = self._shape_key_value(self.v_proj(hidden_states), -1, bsz)
|
189 |
+
past_key_value = torch.cat([key_states[:, :, -tgt_len:].unsqueeze(0), value_states[:, :, -tgt_len:].unsqueeze(0)], dim=0)
|
190 |
+
else:
|
191 |
+
# reuse k, v, self_attention
|
192 |
+
key_states = self._shape_key_value(self.k_proj(hidden_states), -1, bsz)
|
193 |
+
value_states = self._shape_key_value(self.v_proj(hidden_states), -1, bsz)
|
194 |
+
key_states = torch.cat([past_key_value[0], key_states], dim=2)
|
195 |
+
value_states = torch.cat([past_key_value[1], value_states], dim=2)
|
196 |
+
past_key_value = torch.cat([key_states[:, :, -tgt_len:].unsqueeze(0), value_states[:, :, -tgt_len:].unsqueeze(0)], dim=0)
|
197 |
+
|
198 |
+
proj_shape = (bsz * self.num_heads, -1, self.head_dim)
|
199 |
+
query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
|
200 |
+
|
201 |
+
# Expand kv heads, then match query shape
|
202 |
+
key_states = key_states.repeat_interleave(self.num_kv_groups, dim=1).reshape(*proj_shape)
|
203 |
+
value_states = value_states.repeat_interleave(self.num_kv_groups, dim=1).reshape(*proj_shape)
|
204 |
+
|
205 |
+
src_len = key_states.size(1)
|
206 |
+
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
|
207 |
+
|
208 |
+
if not is_cross_attention:
|
209 |
+
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
|
210 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
211 |
+
|
212 |
+
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
|
213 |
+
|
214 |
+
attn_output = torch.bmm(attn_weights, value_states).view(bsz, self.num_heads, tgt_len, self.head_dim).transpose(1,2)
|
215 |
+
|
216 |
+
# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
|
217 |
+
# partitioned across GPUs when using tensor-parallelism.
|
218 |
+
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
|
219 |
+
attn_output = self.out_proj(attn_output)
|
220 |
+
|
221 |
+
return attn_output, past_key_value
|
222 |
+
|
223 |
+
|
224 |
+
class MBartMoEDecoderLayer(nn.Module):
|
225 |
+
def __init__(self, config: MBartConfig, has_moe=False):
|
226 |
+
super().__init__()
|
227 |
+
self.embed_dim = config.d_model
|
228 |
+
|
229 |
+
self.self_attn = MBartGQAttention(
|
230 |
+
embed_dim=self.embed_dim,
|
231 |
+
num_heads=config.decoder_attention_heads,
|
232 |
+
num_kv_heads=config.kv_heads,
|
233 |
+
dropout=config.attention_dropout,
|
234 |
+
is_decoder=True,
|
235 |
+
is_causal=True,
|
236 |
+
config=config,
|
237 |
+
)
|
238 |
+
self.dropout = config.dropout
|
239 |
+
self.activation_fn = ACT2FN[config.activation_function]
|
240 |
+
self.activation_dropout = config.activation_dropout
|
241 |
+
|
242 |
+
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
243 |
+
self.encoder_attn = MBartGQAttention(
|
244 |
+
self.embed_dim,
|
245 |
+
config.decoder_attention_heads,
|
246 |
+
num_kv_heads=config.kv_heads,
|
247 |
+
dropout=config.attention_dropout,
|
248 |
+
is_decoder=True,
|
249 |
+
config=config,
|
250 |
+
)
|
251 |
+
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
252 |
+
self.has_moe = has_moe
|
253 |
+
if has_moe:
|
254 |
+
self.moe = MBartExpertLayer(config)
|
255 |
+
else:
|
256 |
+
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
|
257 |
+
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
|
258 |
+
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
|
259 |
+
|
260 |
+
def forward(
|
261 |
+
self,
|
262 |
+
hidden_states: torch.Tensor,
|
263 |
+
attention_mask: Optional[torch.Tensor] = None,
|
264 |
+
langs: Optional[torch.LongTensor] = None,
|
265 |
+
self_kv_cache: Optional[torch.Tensor] = None,
|
266 |
+
cross_kv_cache: Optional[torch.Tensor] = None,
|
267 |
+
is_prefill: Optional[bool] = False,
|
268 |
+
encoder_hidden_states: Optional[torch.Tensor] = None,
|
269 |
+
encoder_attention_mask: Optional[torch.Tensor] = None,
|
270 |
+
use_cache: Optional[bool] = True,
|
271 |
+
) -> torch.Tensor:
|
272 |
+
residual = hidden_states
|
273 |
+
hidden_states = self.self_attn_layer_norm(hidden_states)
|
274 |
+
|
275 |
+
# Self Attention
|
276 |
+
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
|
277 |
+
# add present self-attn cache to positions 1,2 of present_key_value tuple
|
278 |
+
hidden_states, present_key_value = self.self_attn(
|
279 |
+
hidden_states=hidden_states,
|
280 |
+
past_key_value=self_kv_cache,
|
281 |
+
is_prefill=is_prefill,
|
282 |
+
attention_mask=attention_mask,
|
283 |
+
)
|
284 |
+
hidden_states = residual + hidden_states
|
285 |
+
|
286 |
+
# Cross-Attention Block
|
287 |
+
if encoder_hidden_states is not None:
|
288 |
+
residual = hidden_states
|
289 |
+
hidden_states = self.encoder_attn_layer_norm(hidden_states)
|
290 |
+
|
291 |
+
# cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
|
292 |
+
hidden_states, cross_attn_present_key_value = self.encoder_attn(
|
293 |
+
hidden_states=hidden_states,
|
294 |
+
key_value_states=encoder_hidden_states,
|
295 |
+
is_prefill=is_prefill,
|
296 |
+
attention_mask=encoder_attention_mask,
|
297 |
+
past_key_value=cross_kv_cache,
|
298 |
+
)
|
299 |
+
hidden_states = residual + hidden_states
|
300 |
+
|
301 |
+
# add cross-attn to positions 3,4 of present_key_value tuple
|
302 |
+
present_key_value = (present_key_value, cross_attn_present_key_value)
|
303 |
+
|
304 |
+
# Fully Connected
|
305 |
+
residual = hidden_states
|
306 |
+
hidden_states = self.final_layer_norm(hidden_states)
|
307 |
+
if self.has_moe:
|
308 |
+
hidden_states = self.moe(hidden_states, langs)
|
309 |
+
else:
|
310 |
+
hidden_states = self.activation_fn(self.fc1(hidden_states))
|
311 |
+
hidden_states = self.fc2(hidden_states)
|
312 |
+
|
313 |
+
hidden_states = residual + hidden_states
|
314 |
+
|
315 |
+
outputs = (hidden_states,)
|
316 |
+
|
317 |
+
if use_cache:
|
318 |
+
outputs += (present_key_value,)
|
319 |
+
|
320 |
+
return outputs
|
321 |
+
|
322 |
+
|
323 |
+
class MBartMoEDecoder(MBartDecoder):
|
324 |
+
def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None):
|
325 |
+
MBartPreTrainedModel.__init__(self, config)
|
326 |
+
self.dropout = config.dropout
|
327 |
+
self.layerdrop = config.decoder_layerdrop
|
328 |
+
self.padding_idx = config.pad_token_id
|
329 |
+
self.max_target_positions = config.max_position_embeddings
|
330 |
+
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
|
331 |
+
|
332 |
+
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
|
333 |
+
|
334 |
+
if embed_tokens is not None:
|
335 |
+
self.embed_tokens.weight = embed_tokens.weight
|
336 |
+
|
337 |
+
self.embed_positions = MBartLearnedPositionalEmbedding(
|
338 |
+
config.max_position_embeddings,
|
339 |
+
config.d_model,
|
340 |
+
)
|
341 |
+
# Language-specific MoE goes at second and second-to-last layer
|
342 |
+
self.layers = nn.ModuleList([MBartMoEDecoderLayer(config, has_moe=(i in config.moe_layers) and config.use_moe) for i in range(config.decoder_layers)])
|
343 |
+
self.layernorm_embedding = nn.LayerNorm(config.d_model)
|
344 |
+
self.layer_norm = nn.LayerNorm(config.d_model)
|
345 |
+
|
346 |
+
self.gradient_checkpointing = False
|
347 |
+
# Initialize weights and apply final processing
|
348 |
+
self.post_init()
|
349 |
+
|
350 |
+
def forward(
|
351 |
+
self,
|
352 |
+
input_ids: torch.LongTensor = None,
|
353 |
+
attention_mask: Optional[torch.Tensor] = None,
|
354 |
+
self_kv_cache: Optional[torch.Tensor] = None,
|
355 |
+
cross_kv_cache: Optional[torch.Tensor] = None,
|
356 |
+
past_token_count: Optional[int] = None,
|
357 |
+
langs: Optional[torch.LongTensor] = None,
|
358 |
+
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
359 |
+
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
|
360 |
+
use_cache = True
|
361 |
+
return_dict = True
        input = input_ids
        input_shape = input.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        # past_key_values_length
        past_key_values_length = past_token_count if self_kv_cache is not None else 0
        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale

        # embed positions
        positions = self.embed_positions(input, past_key_values_length)

        hidden_states = inputs_embeds + positions
        hidden_states = self.layernorm_embedding(hidden_states)

        # decoder layers
        all_hidden_states = None
        all_self_attns = None
        all_cross_attentions = None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            is_prefill = past_token_count == 0
            layer_self_kv_cache = self_kv_cache[idx] if self_kv_cache is not None else None
            layer_cross_kv_cache = cross_kv_cache[idx] if cross_kv_cache is not None else None
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                langs=langs,
                self_kv_cache=layer_self_kv_cache,
                cross_kv_cache=layer_cross_kv_cache,
                is_prefill=is_prefill,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=None,
                use_cache=use_cache,
            )
            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


class MBartMoEDecoderWrapper(MBartPreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    """

    def __init__(self, config):
        super().__init__(config)
        self.decoder = MBartMoEDecoder(config)

    def forward(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)


class MBartMoE(MBartForCausalLM):
    config_class = MBartMoEConfig
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config, **kwargs):
        config = copy.deepcopy(config)
        config.is_decoder = True
        config.is_encoder_decoder = False
        MBartPreTrainedModel.__init__(self, config)
        self.model = MBartMoEDecoderWrapper(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        self_kv_cache: Optional[torch.FloatTensor] = None,
        cross_kv_cache: Optional[torch.FloatTensor] = None,
        past_token_count: Optional[int] = None,
        langs: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            self_kv_cache=self_kv_cache,
            cross_kv_cache=cross_kv_cache,
            past_token_count=past_token_count,
            langs=langs,
            encoder_hidden_states=encoder_hidden_states,
        )

        logits = self.lm_head(outputs[0])

        if not return_dict:
            output = (logits,) + outputs[1:]
            return output

        return CausalLMOutputWithCrossAttentions(
            loss=None,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prune_moe_experts(self, keep_keys: List[int]):
        # Remove experts not specified in keep_keys
        str_keep_keys = [str(key) for key in keep_keys]
        for layer in self.model.decoder.layers:
            if not layer.has_moe:
                continue

            lang_keys = list(layer.moe.experts.keys())
            for lang in lang_keys:
                if lang not in str_keep_keys:
                    layer.moe.experts.pop(lang)
            layer.lang_codes = keep_keys
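The `prune_moe_experts` method above is what lets the recognition decoder keep only the per-language expert MLPs a given run actually needs. Below is a minimal illustrative sketch (not part of this diff) of driving it against an already loaded `MBartMoE` instance; the `lang_ids` values are placeholders, since the real ISO-code-to-integer mapping lives in the checkpoint's `config.decoder.langs` dictionary.

```python
# Hypothetical helper for illustration only: prune a loaded MBartMoE decoder down
# to the experts for a chosen set of language codes, then sanity-check what remains.
def prune_to_languages(decoder, lang_ids):
    decoder.prune_moe_experts(keep_keys=lang_ids)
    for layer in decoder.model.decoder.layers:
        if layer.has_moe:
            # Expert keys are stored as strings, so only str(code) entries should survive.
            assert set(layer.moe.experts.keys()) <= {str(code) for code in lang_ids}
    return decoder
```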
surya/model/recognition/encoder.py
ADDED
@@ -0,0 +1,469 @@
from torch import nn
import torch
from typing import Optional, Tuple, Union

from transformers.models.donut.modeling_donut_swin import DonutSwinPatchEmbeddings, DonutSwinEmbeddings, DonutSwinModel, \
    DonutSwinEncoder, DonutSwinModelOutput, DonutSwinEncoderOutput, DonutSwinAttention, DonutSwinDropPath, \
    DonutSwinIntermediate, DonutSwinOutput, window_partition, window_reverse

# from config import VariableDonutSwinConfig

from .config import VariableDonutSwinConfig


class VariableDonutSwinEmbeddings(DonutSwinEmbeddings):
    """
    Construct the patch and position embeddings. Optionally, also the mask token.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__(config, use_mask_token)

        self.patch_embeddings = DonutSwinPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.patch_grid = self.patch_embeddings.grid_size
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None
        self.position_embeddings = None

        if config.use_absolute_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))

        self.norm = nn.LayerNorm(config.embed_dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> Tuple[torch.Tensor]:

        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
        # Layernorm across the last dimension (each patch is a single row)
        embeddings = self.norm(embeddings)
        batch_size, seq_len, embed_dim = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings[:, :seq_len, :]

        embeddings = self.dropout(embeddings)

        return embeddings, output_dimensions


class VariableDonutSwinPatchMerging(nn.Module):
    """
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def maybe_pad(self, input_feature, height, width):
        should_pad = (height % 2 == 1) or (width % 2 == 1)
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)
            input_feature = nn.functional.pad(input_feature, pad_values)

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions
        # `dim` is height * width
        batch_size, dim, num_channels = input_feature.shape

        input_feature = input_feature.view(batch_size, height, width, num_channels)
        # pad input to be divisible by width and height, if needed
        input_feature = self.maybe_pad(input_feature, height, width)
        # [batch_size, height/2, width/2, num_channels]
        input_feature_0 = input_feature[:, 0::2, 0::2, :]
        # [batch_size, height/2, width/2, num_channels]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]
        # [batch_size, height/2, width/2, num_channels]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]
        # [batch_size, height/2, width/2, num_channels]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]
        # batch_size height/2 width/2 4*num_channels
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)  # batch_size height/2*width/2 4*C

        input_feature = self.norm(input_feature)
        input_feature = self.reduction(input_feature)

        return input_feature


class VariableDonutSwinLayer(nn.Module):
    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.shift_size = shift_size
        self.window_size = config.window_size
        self.input_resolution = input_resolution
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = DonutSwinAttention(config, dim, num_heads, window_size=self.window_size)
        self.drop_path = DonutSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.intermediate = DonutSwinIntermediate(config, dim)
        self.output = DonutSwinOutput(config, dim)

    def set_shift_and_window_size(self, input_resolution):
        if min(input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(input_resolution)

    def get_attn_mask(self, height, width, dtype):
        if self.shift_size > 0:
            # calculate attention mask for SW-MSA
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        return attn_mask

    def maybe_pad(self, hidden_states, height, width):
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        return hidden_states, pad_values

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if not always_partition:
            self.set_shift_and_window_size(input_dimensions)
        else:
            pass
        height, width = input_dimensions
        batch_size, _, channels = hidden_states.size()
        shortcut = hidden_states

        hidden_states = self.layernorm_before(hidden_states)

        hidden_states = hidden_states.view(batch_size, height, width, channels)

        # pad hidden_states to multiples of window size
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)

        _, height_pad, width_pad, _ = hidden_states.shape
        # cyclic shift
        if self.shift_size > 0:
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # partition windows
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
        attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)
        if attn_mask is not None:
            attn_mask = attn_mask.to(hidden_states_windows.device)

        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )

        attention_output = attention_outputs[0]

        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)

        # reverse cyclic shift
        if self.shift_size > 0:
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attention_windows = shifted_windows

        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        attention_windows = attention_windows.view(batch_size, height * width, channels)

        hidden_states = shortcut + self.drop_path(attention_windows)

        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)
        layer_output = hidden_states + self.output(layer_output)

        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        return layer_outputs


class VariableDonutSwinStage(nn.Module):
    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
        super().__init__()
        self.config = config
        self.dim = dim
        self.blocks = nn.ModuleList(
            [
                VariableDonutSwinLayer(
                    config=config,
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    shift_size=0 if (i % 2 == 0) else int(config.window_size // 2),
                )
                for i in range(depth)
            ]
        )

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        self.pointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        height, width = input_dimensions
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
            )

            hidden_states = layer_outputs[0]

        hidden_states_before_downsampling = hidden_states
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            output_dimensions = (height, width, height, width)

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs


class VariableDonutSwinEncoder(nn.Module):
    def __init__(self, config, grid_size):
        super().__init__()
        self.num_layers = len(config.depths)
        self.config = config
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
        self.layers = nn.ModuleList(
            [
                VariableDonutSwinStage(
                    config=config,
                    dim=int(config.embed_dim * 2**i_layer),
                    input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                    depth=config.depths[i_layer],
                    num_heads=config.num_heads[i_layer],
                    drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    downsample=VariableDonutSwinPatchMerging if (i_layer < self.num_layers - 1) else None,
                )
                for i_layer in range(self.num_layers)
            ]
        )

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, DonutSwinEncoderOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if output_hidden_states:
            batch_size, _, hidden_size = hidden_states.shape
            # rearrange b (h w) c -> b c h w
            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, layer_module in enumerate(self.layers):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    input_dimensions,
                    layer_head_mask,
                    output_attentions,
                    always_partition,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
                )

            hidden_states = layer_outputs[0]
            hidden_states_before_downsampling = layer_outputs[1]
            output_dimensions = layer_outputs[2]

            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
                # rearrange b (h w) c -> b c h w
                # here we use the original (not downsampled) height and width
                reshaped_hidden_state = hidden_states_before_downsampling.view(
                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
                )
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states.shape
                # rearrange b (h w) c -> b c h w
                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

            if output_attentions:
                all_self_attentions += layer_outputs[3:]

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        return DonutSwinEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )


class VariableDonutSwinModel(DonutSwinModel):
    config_class = VariableDonutSwinConfig

    def __init__(self, config, add_pooling_layer=True, use_mask_token=False, **kwargs):
        super().__init__(config)
        self.config = config
        self.num_layers = len(config.depths)
        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))

        self.embeddings = VariableDonutSwinEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = VariableDonutSwinEncoder(config, self.embeddings.patch_grid)

        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple, DonutSwinModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, len(self.config.depths))

        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]

        pooled_output = None
        if self.pooler is not None:
            pooled_output = self.pooler(sequence_output.transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]

            return output

        return DonutSwinModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )
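The classes above mirror the Donut-Swin implementation in `transformers`, but the embeddings slice the absolute position embeddings to the actual sequence length instead of assuming a fixed grid, which is what allows variable input sizes. Below is a minimal standalone sketch of a forward pass through this encoder, assuming `VariableDonutSwinConfig` accepts the usual Donut-Swin fields; the tiny sizes are placeholders for illustration, not the checkpoint's real configuration.

```python
import torch
from surya.model.recognition.config import VariableDonutSwinConfig
from surya.model.recognition.encoder import VariableDonutSwinModel

# Placeholder config, assuming the standard Donut-Swin fields are accepted.
config = VariableDonutSwinConfig(
    image_size=(64, 128), patch_size=4, num_channels=3,
    embed_dim=32, depths=[2, 2], num_heads=[2, 4], window_size=4,
)
model = VariableDonutSwinModel(config).eval()

pixel_values = torch.zeros(1, 3, 64, 128)  # (batch, channels, height, width)
with torch.no_grad():
    out = model(pixel_values)

# One patch-merging stage halves the 16x32 patch grid to 8x16 tokens of dim 64.
print(out.last_hidden_state.shape)  # torch.Size([1, 128, 64])
print(out.pooler_output.shape)      # torch.Size([1, 64])
```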
surya/model/recognition/model.py
ADDED
@@ -0,0 +1,64 @@
import warnings

import torch

warnings.filterwarnings("ignore", message="torch.utils._pytree._register_pytree_node is deprecated")

import logging
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

from typing import List, Optional, Tuple
from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig, AutoModel, AutoModelForCausalLM
from surya.model.recognition.config import MBartMoEConfig, VariableDonutSwinConfig
from surya.model.recognition.encoder import VariableDonutSwinModel
from surya.model.recognition.decoder import MBartMoE
from surya.settings import settings


def load_model(checkpoint=settings.RECOGNITION_MODEL_CHECKPOINT, device=settings.TORCH_DEVICE_MODEL, dtype=settings.MODEL_DTYPE, langs: Optional[List[int]] = None):
    config = VisionEncoderDecoderConfig.from_pretrained(checkpoint)

    # Prune moe experts that are not needed before loading the model
    if langs is not None:
        config.decoder.langs = {lang_iso: lang_int for lang_iso, lang_int in config.decoder.langs.items() if lang_int in langs}

    decoder_config = vars(config.decoder)
    decoder = MBartMoEConfig(**decoder_config)
    config.decoder = decoder

    encoder_config = vars(config.encoder)
    encoder = VariableDonutSwinConfig(**encoder_config)
    config.encoder = encoder

    # Get transformers to load custom encoder/decoder
    AutoModel.register(MBartMoEConfig, MBartMoE)
    AutoModelForCausalLM.register(MBartMoEConfig, MBartMoE)
    AutoModel.register(VariableDonutSwinConfig, VariableDonutSwinModel)

    model = LangVisionEncoderDecoderModel.from_pretrained(checkpoint, config=config, torch_dtype=dtype)
    assert isinstance(model.decoder, MBartMoE)
    assert isinstance(model.encoder, VariableDonutSwinModel)

    model = model.to(device)
    model = model.eval()
    print(f"Loaded recognition model {checkpoint} on device {device} with dtype {dtype}")
    return model


class LangVisionEncoderDecoderModel(VisionEncoderDecoderModel):
    def prepare_inputs_for_generation(
        self, input_ids, decoder_langs=None, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
    ):
        decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, langs=decoder_langs, past_key_values=past_key_values)
        decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None
        input_dict = {
            "attention_mask": attention_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "decoder_input_ids": decoder_inputs["input_ids"],
            "encoder_outputs": encoder_outputs,
            "past_key_values": decoder_inputs["past_key_values"],
            "use_cache": use_cache,
            "decoder_langs": decoder_inputs["langs"],
        }
        return input_dict
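Taken together, `load_model` swaps the stock MBart decoder and Donut-Swin encoder configs for the MoE and variable-size variants defined in this diff, then loads them through a thin `VisionEncoderDecoderModel` subclass that threads `decoder_langs` into generation. A hedged usage sketch follows; it is illustrative only and downloads the default checkpoint configured in `surya.settings`.

```python
from surya.model.recognition.model import load_model

# Checkpoint, device, and dtype default to the values in surya.settings. Passing
# `langs=[...]` (integer codes from the checkpoint's config.decoder.langs mapping)
# would restrict the decoder's MoE experts to those languages before loading.
model = load_model()

print(type(model).__name__)          # LangVisionEncoderDecoderModel
print(type(model.encoder).__name__)  # VariableDonutSwinModel
print(type(model.decoder).__name__)  # MBartMoE
```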