File size: 1,328 Bytes
97e4014
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

import os, sys

import argparse

from datasets import load_dataset, concatenate_datasets, Dataset
from huggingface_hub import login

# Put the directory containing this file at the front of the import search
# path. The absolute location is resolved first, then reduced to its folder.
path = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, path)

def merge_dataset(datapaths: str, split: str = "train") -> Dataset:
    """Load several datasets and concatenate them into a single one.

    Args:
        datapaths: Comma-separated dataset identifiers, each passed to
            ``load_dataset``. Whitespace around an entry is ignored and
            empty entries are skipped.
        split: Name of the split loaded from every dataset. Defaults to
            ``"train"``.

    Returns:
        The loaded dataset when a single path is given, otherwise the
        concatenation of all loaded datasets in the order they were listed.

    Raises:
        ValueError: If ``datapaths`` contains no non-empty entry.
    """
    names = [name.strip() for name in datapaths.split(",")]
    names = [name for name in names if name]
    if not names:
        raise ValueError(
            "datapaths must contain at least one dataset path "
            "(comma-separated)"
        )

    parts = [load_dataset(name, split=split) for name in names]
    if len(parts) == 1:
        return parts[0]

    # Concatenate every part in one call. The previous loop stored each
    # concatenation in a throwaway variable, so only the first dataset was
    # ever returned.
    return concatenate_datasets(parts)


if __name__ == "__main__":
    # Command-line entry point: log in to the Hugging Face Hub, merge the
    # datasets listed in --datapaths, and push the result to the Hub.
    parser = argparse.ArgumentParser()
    parser.add_argument("--datapaths", type=str, default="")
    parser.add_argument("--huggingface_hub_token", type=str, default="")
    # NOTE(review): --split is parsed but not forwarded anywhere below.
    parser.add_argument("--split", type=str, default="train")
    args = parser.parse_args()

    # Echo the effective arguments. The hub token is masked so the secret
    # does not end up in console output or captured logs.
    shown = {
        key: "***" if key == "huggingface_hub_token" and value else value
        for key, value in vars(args).items()
    }
    print("=========================================")
    print('\n'.join(f' + {k}={v}' for k, v in shown.items()))
    print("=========================================")

    login(token=args.huggingface_hub_token)
    print("Successfully logged in to Huggingface Hub")

    dataset = merge_dataset(datapaths=args.datapaths)

    # Repository id the merged dataset is pushed to.
    DATASET_ID = "qds-triplet-dialogsum"
    dataset.push_to_hub(DATASET_ID)
    print(f"Successful push to Huggingface Hub: {DATASET_ID}")