File size: 4,165 Bytes
97e4014
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113

from datasets import load_dataset
from datasets import DatasetDict, Dataset
import random
from transformers import set_seed


def _as_instruction_samples(dataset, instruction: str) -> list:
    """Map raw DialogSum rows to (instruction, input, output) triples.

    Each row is expected to carry "dialogue" and "summary" fields
    (the schema of the binwang/InstructDS_datasets configurations).
    """
    return [
        {
            "instruction": instruction,
            "input": sample["dialogue"],
            "output": sample["summary"],
        }
        for sample in dataset
    ]


def _to_columnar(samples: list) -> dict:
    """Convert a list of sample dicts into the column-wise dict expected by Dataset.from_dict."""
    return {
        "instruction": [s["instruction"] for s in samples],
        "input": [s["input"] for s in samples],
        "output": [s["output"] for s in samples],
    }


def ingest_data(datapath: str) -> DatasetDict:
    """Build the instruction-tuning DatasetDict from the DialogSum datasets.

    Loads the DialogSum train/validation splits plus a capped random sample of
    the DialogSum_QDS split, converts every row into an
    (instruction, input, output) triple, and duplicates the training data with
    a second variant whose instruction carries an explicit target word count.

    Args:
        datapath: Kept for interface compatibility. NOTE(review): it is
            currently ignored — both sources are hard-coded to
            "binwang/InstructDS_datasets", matching the original behavior.

    Returns:
        DatasetDict with shuffled "train" and "validation" splits, each with
        columns "instruction", "input", "output".
    """
    set_seed(42)

    QDS_LIMIT = 6000  # cap on question-driven-summarization samples mixed in

    summarize_repo = "binwang/InstructDS_datasets"
    qds_repo = "binwang/InstructDS_datasets"

    # --- Train split ------------------------------------------------------
    origin_train = _as_instruction_samples(
        load_dataset(summarize_repo, "DialogSum", split="train"),
        "Please summarize the following dialogue.",
    )
    qds_train = _as_instruction_samples(
        load_dataset(qds_repo, "DialogSum_QDS", split="train"),
        "Please answer the following question.",
    )
    # Guard against the split being smaller than the cap: random.sample
    # raises ValueError when k exceeds the population size.
    qds_train = random.sample(qds_train, min(QDS_LIMIT, len(qds_train)))

    all_train_data = origin_train + qds_train
    print("Len of origin_train_dialogsum: ", len(origin_train))
    print("Len of all train data 1: ", len(origin_train))
    print("Len of all train data 2: ", len(all_train_data))

    naive_dict = _to_columnar(all_train_data)
    print("Len of naive_all_train_data_dict: ", len(naive_dict["instruction"]))

    # Length-controlled variant: same samples, but the instruction states the
    # reference output's word count so the model learns length conditioning.
    with_len_dict = {
        "instruction": [
            s["instruction"]
            + f" The output should be {len(s['output'].split())} words long."
            for s in all_train_data
        ],
        "input": [s["input"] for s in all_train_data],
        "output": [s["output"] for s in all_train_data],
    }
    print("Len of with_len_train_data_dict: ", len(with_len_dict["instruction"]))

    all_train_dict = {
        key: naive_dict[key] + with_len_dict[key]
        for key in ("instruction", "input", "output")
    }
    print("Len of all_train_data_dict: ", len(all_train_dict["instruction"]))

    # Seed the shuffle explicitly: Dataset.shuffle() without a seed draws
    # fresh OS entropy, so set_seed(42) alone does NOT make it reproducible.
    train_data = Dataset.from_dict(all_train_dict).shuffle(seed=42)

    print(type(train_data))
    print(train_data["instruction"][:10])
    print(train_data["input"][:10])
    print(train_data["output"][:10])

    print("===================", len(train_data), "===================")

    # --- Validation split -------------------------------------------------
    validation_samples = _as_instruction_samples(
        load_dataset(summarize_repo, "DialogSum", split="validation"),
        "Please summarize the following dialogue.",
    )
    validation_data = Dataset.from_dict(
        _to_columnar(validation_samples)
    ).shuffle(seed=42)

    return DatasetDict({
        "train": train_data,
        "validation": validation_data,
    })