### Create file named dataset.py ### Paste # coding=utf-8 import json import os from pathlib import Path import datasets from PIL import Image import pandas as pd logger = datasets.logging.get_logger(__name__) _CITATION = """{}""" _DESCRIPTION = """Discharge Summary""" def load_image(image_path): image = Image.open(image_path) w, h = image.size return image, (w, h) def normalize_bbox(bbox, size): return [ int(1000 * bbox[0] / size[0]), int(1000 * bbox[1] / size[1]), int(1000 * bbox[2] / size[0]), int(1000 * bbox[3] / size[1]), ] class SroieConfig(datasets.BuilderConfig): """BuilderConfig for SROIE""" def __init__(self, **kwargs): """BuilderConfig for SROIE. Args: **kwargs: keyword arguments forwarded to super. """ super(SroieConfig, self).__init__(**kwargs) class Sroie(datasets.GeneratorBasedBuilder): BUILDER_CONFIGS = [ SroieConfig(name="discharge", version=datasets.Version("1.0.0"), description="Discharge summary dataset"), ] def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, features=datasets.Features( { "id": datasets.Value("string"), "words": datasets.Sequence(datasets.Value("string")), "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))), "ner_tags": datasets.Sequence( datasets.features.ClassLabel( names=['others', 'produttore_key', 'produttore_value', 'cliente_key', 'cliente_value', 'unitloc_key', 'unitloc_value', 'operatore_key', 'operatore_value', 'referente_key', 'referente_value', 'cfproduttore_key', 'cfproduttore_value', 'telefono_key', 'telefono_value', 'emailcliente_key', 'emailcliente_value', 'datarichiesta_key', 'datarichiesta_value', 'orariorichiesta_key', 'orariorichiesta_value', 'emailproduttore_key', 'emailproduttore_value', 'mattina_key', 'mattina_value', 'pomeriggio_key', 'pomeriggio_value', 'cer_key', 'cer_value', 'descrizione_key', 'descrizione_value', 'sf_key', 'sf_value', 'classpericolo_key', 'classpericolo_value', 'destino_key', 'destino_value', 'confezionamento_key', 'confezionamento_value', 'destinazione_key', 'destinazione_value' ] ) ), #"image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), "image_path": datasets.Value("string"), } ), supervised_keys=None, citation=_CITATION, homepage="", ) def _split_generators(self, dl_manager): """Returns SplitGenerators.""" """Uses local files located with data_dir""" #downloaded_file = dl_manager.download_and_extract(_URLS) # move files from the second URL together with files from the first one. dest = Path('dataset') return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={"filepath": dest/"train"} ), datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={"filepath": dest/"test"} ), ] def _generate_examples(self, filepath): logger.info("⏳ Generating examples from = %s", filepath) ann_dir = os.path.join(filepath, "annotation_dir") img_dir = os.path.join(filepath, "img_dir") for guid, fname in enumerate(sorted(os.listdir(img_dir))): name, ext = os.path.splitext(fname) file_path = os.path.join(ann_dir, name + ".csv") df = pd.read_csv(file_path) image_path = os.path.join(img_dir, fname) image, size = load_image(image_path) boxes = [[xmin, ymin, xmax, ymax] for xmin, ymin, xmax, ymax in zip(df['left'],df['top'],df['left']+df['width'],df['top']+df['height'])] text = [i for i in df['text']] label = [i for i in df['label']] boxes = [normalize_bbox(box, size) for box in boxes] print(image_path) for i in boxes: for j in i: if j>1000: print(j) pass yield guid, {"id": str(guid), "words": text, "bboxes": boxes, "ner_tags": label, "image_path": image_path}