# Page-scrape residue (not part of the module): "Spaces: Runtime error".
import os | |
import json | |
import datasets | |
from pathlib import Path | |
# Free-text description stored in the DatasetInfo returned by the builder.
_DESCRIPTION = "Gigaword dataset"

# Feature (column) names of every generated example.
_DOCUMENT = "document"
_ID = "id"
class GigawordDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that reads Gigaword documents from local JSON Lines files.

    The builder looks in the download manager's manual directory (normally the
    ``data_dir`` given when the dataset is loaded) for ``train.jsonl`` and
    ``val.jsonl``. Every non-blank line of those files must be a JSON object
    with an ``"id"`` field and a ``"text"`` field.
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata: description and the two string features."""
        # No supervised_keys: examples only carry a document and its id.
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    _DOCUMENT: datasets.Value("string"),
                    _ID: datasets.Value("string"),
                }
            ),
        )

    def _split_generators(self, dl_manager):
        """Return the train and validation SplitGenerators.

        Args:
            dl_manager: Download manager whose ``manual_dir`` points at the
                directory holding ``train.jsonl`` and ``val.jsonl``.

        Raises:
            ValueError: If no data directory was provided.
        """
        # Use the public accessor rather than the private ``_data_dir`` field.
        data_dir = dl_manager.manual_dir
        if data_dir is None:
            raise ValueError(
                "No data directory was provided; pass data_dir pointing to the "
                "folder that contains train.jsonl and val.jsonl."
            )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "path": os.path.join(data_dir, "train.jsonl"),
                    "name": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "path": os.path.join(data_dir, "val.jsonl"),
                    "name": "validation",
                },
            ),
        ]

    def _generate_examples(self, path=None, name=None):
        """Yield ``(key, example)`` pairs read from one JSON Lines file.

        Args:
            path: Location of the ``.jsonl`` file to read (UTF-8 encoded).
            name: Split name supplied through ``gen_kwargs``; currently unused.

        Yields:
            Tuples of the record's ``"id"`` value and a dict holding that id
            and the record's ``"text"`` under the document feature.
        """
        with open(path, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. trailing newlines) instead of
                # letting json.loads fail on an empty string.
                if not line.strip():
                    continue
                record = json.loads(line)
                example_id = record["id"]
                yield example_id, {
                    _ID: example_id,
                    _DOCUMENT: record["text"],
                }