microsoft
/

udop-large-512

Image-Text-to-Text

Model card Files and versions Community

nielsr HF staff committed on Mar 8, 2024

Commit

1797b1f

·

verified ·

1 Parent(s): 1b604c2

Update README.md

Files changed (1) hide show

README.md +8 -2

README.md CHANGED Viewed

@@ -27,15 +27,21 @@ from transformers import AutoProcessor, UdopForConditionalGeneration
 from datasets import load_dataset
 # load model and processor
-processor = AutoProcessor.from_pretrained("microsoft/udop-large-512", apply_ocr=False)
-model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large-512")
 dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
 example = dataset[0]
 image = example["image"]
 words = example["tokens"]
 boxes = example["bboxes"]
 question = "Question answering. What is the date on the form?"
 encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
 # autoregressive generation

 from datasets import load_dataset
 # load model and processor
+# in this case, we already have performed OCR ourselves
+# so we initialize the processor with `apply_ocr=False`
+processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
+model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large")
+# load an example image, along with the words and coordinates
+# which were extracted using an OCR engine
 dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
 example = dataset[0]
 image = example["image"]
 words = example["tokens"]
 boxes = example["bboxes"]
 question = "Question answering. What is the date on the form?"
+# prepare everything for the model
 encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
 # autoregressive generation