mrm8488's picture
First commit
c32ee7d
raw
history blame
3.67 kB
# coding=utf-8
import datasets
import requests
from promptsource.templates import INCLUDED_USERS
def removeHyphen(example):
example_clean = {}
for key in example.keys():
if "-" in key:
new_key = key.replace("-", "_")
example_clean[new_key] = example[key]
else:
example_clean[key] = example[key]
example = example_clean
return example
def renameDatasetColumn(dataset):
col_names = dataset.column_names
for cols in col_names:
if "-" in cols:
dataset = dataset.rename_column(cols, cols.replace("-", "_"))
return dataset
#
# Helper functions for datasets library
#
def get_dataset_builder(path, conf=None):
"Get a dataset builder from name and conf."
module_path = datasets.load.prepare_module(path, dataset=True)
builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)
if conf:
builder_instance = builder_cls(name=conf, cache_dir=None, hash=module_path[1])
else:
builder_instance = builder_cls(cache_dir=None, hash=module_path[1])
return builder_instance
def get_dataset(path, conf=None):
"Get a dataset from name and conf."
builder_instance = get_dataset_builder(path, conf)
if builder_instance.manual_download_instructions is None and builder_instance.info.size_in_bytes is not None:
builder_instance.download_and_prepare()
return builder_instance.as_dataset()
else:
return datasets.load_dataset(path, conf)
def get_dataset_confs(path):
"Get the list of confs for a dataset."
module_path = datasets.load.prepare_module(path, dataset=True)
# Get dataset builder class from the processing script
builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)
# Instantiate the dataset builder
confs = builder_cls.BUILDER_CONFIGS
if confs and len(confs) > 1:
return confs
return []
def render_features(features):
"""Recursively render the dataset schema (i.e. the fields)."""
if isinstance(features, dict):
return {k: render_features(v) for k, v in features.items()}
if isinstance(features, datasets.features.ClassLabel):
return features.names
if isinstance(features, datasets.features.Value):
return features.dtype
if isinstance(features, datasets.features.Sequence):
return {"[]": render_features(features.feature)}
return features
#
# Loads dataset information
#
def filter_english_datasets():
"""
Filter English datasets based on language tags in metadata.
Also includes the datasets of any users listed in INCLUDED_USERS
"""
english_datasets = []
response = requests.get("https://huggingface.co/api/datasets?full=true")
tags = response.json()
for dataset in tags:
dataset_name = dataset["id"]
is_community_dataset = "/" in dataset_name
if is_community_dataset:
user = dataset_name.split("/")[0]
if user in INCLUDED_USERS:
english_datasets.append(dataset_name)
continue
if "card_data" not in dataset:
continue
metadata = dataset["card_data"]
if "languages" not in metadata:
continue
languages = metadata["languages"]
if "en" in languages or "en-US" in languages:
english_datasets.append(dataset_name)
return sorted(english_datasets)
def list_datasets(template_collection, _state):
"""Get all the datasets to work with."""
dataset_list = filter_english_datasets()
dataset_list.sort(key=lambda x: x.lower())
return dataset_list