File size: 2,737 Bytes
0d62ce8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""Program Synthesis dataset from dreamcoder. https://github.com/ellisk42/ec"""
from random import choice, shuffle
import datasets

from dreamcoder.domains.text.makeTextTasks import makeTasks as textMakeTasks
from dreamcoder.domains.list.main import main as listMakeTasks


_DESCRIPTION = """\
Generated program synthesis datasets used to train dreamcoder.
"""
_FEATURES = datasets.Features(
    {
        "description": datasets.Value("string"),
        "input": datasets.Value("string"),
        "output": datasets.Value("string"),
        "types": datasets.Value("string")
    }
)

_HOMEPAGE = "https://github.com/ellisk42/ec"

_LICENSE = "MIT License"

_MAX_STEPS = 10


class infIterator:
    def __init__(self, make_mthd):
        self.make_mthd = make_mthd
        self.i = None

    def reset(self):
        tasks = self.make_mthd()

        rows = []
        for task in tasks:
            base = {
                'types': str(task.request),
                "description": task.name,
            }
            for (inp, outp) in task.examples:
                rows.append(dict(input=str(inp), output=str(outp), **base))

        shuffle(rows)
        self.rows = rows
        self.i = 0

    def step(self):
        if self.i is None:
            self.reset()
        row = self.rows[self.i]
        self.i += 1
        if self.i >= len(self.rows):
            self.reset()
        return row


class ProgramSynthesis(datasets.GeneratorBasedBuilder):
    """Program Synthesis dataset from dreamcoder."""

    VERSION = datasets.Version("1.1.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="text", version=VERSION, description="Text tasks."),
        datasets.BuilderConfig(name="list", version=VERSION, description="List tasks."),
        datasets.BuilderConfig(name="all", version=VERSION, description="All tasks at once."),
    ]
    DEFAULT_CONFIG_NAME = "all"

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=_FEATURES,
            supervised_keys=("input", "output"),
            homepage=_HOMEPAGE,
            license=_LICENSE,
        )

    def _split_generators(self, dl_manager):
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
            ),
        ]

    def _generate_examples(self):
        task_samples = {
            'text': infIterator(textMakeTasks),
            'list': infIterator(listMakeTasks)
        }
        for key in range(_MAX_STEPS):

            if self.config.name == 'all':
                dataset_type = choice(task_samples.keys())
            else:
                dataset_type = self.config.name

            yield key, task_samples[dataset_type].step()