Spaces:

mrm8488
/

PromptSource

Build error

App Files Files Community

PromptSource / promptsource /utils.py

mrm8488

First commit

c32ee7d about 3 years ago

raw

history blame

3.67 kB

	# coding=utf-8

	import datasets
	import requests

	from promptsource.templates import INCLUDED_USERS


	def removeHyphen(example):
	    """Return a new dict mirroring ``example`` with hyphens in keys turned into underscores.

	    Keys that contain no hyphen are carried over as they are, and values are
	    copied over unchanged. The mapping passed in is not modified.
	    """
	    return {
	        (name.replace("-", "_") if "-" in name else name): example[name]
	        for name in example.keys()
	    }


	def renameDatasetColumn(dataset):
	    """Rename every hyphenated column of ``dataset`` so hyphens become underscores.

	    The column names are read once up front; each rename goes through
	    ``dataset.rename_column`` and its result replaces ``dataset``. The final
	    object is returned (the input itself when no column contains a hyphen).
	    """
	    hyphenated = [name for name in dataset.column_names if "-" in name]
	    for name in hyphenated:
	        dataset = dataset.rename_column(name, name.replace("-", "_"))
	    return dataset


	#
	# Helper functions for datasets library
	#


	def get_dataset_builder(path, conf=None):
	    """Instantiate the dataset builder for ``path``, optionally for config ``conf``.

	    The builder class is resolved with ``datasets.load.prepare_module`` and
	    ``datasets.load.import_main_class``. The first element of the prepared
	    module locates the class and the second is handed to the builder as
	    ``hash``. A falsy ``conf`` builds the instance without a ``name``.
	    """
	    prepared = datasets.load.prepare_module(path, dataset=True)
	    builder_cls = datasets.load.import_main_class(prepared[0], dataset=True)

	    # Both construction paths share these arguments; only ``name`` is optional.
	    init_kwargs = {"cache_dir": None, "hash": prepared[1]}
	    if conf:
	        init_kwargs["name"] = conf
	    return builder_cls(**init_kwargs)


	def get_dataset(path, conf=None):
	    """Load the dataset identified by ``path`` and the optional config ``conf``.

	    When the builder carries no manual download instructions and reports a
	    size, the data is prepared and returned through the builder. In every
	    other case loading is delegated to ``datasets.load_dataset``.
	    """
	    builder = get_dataset_builder(path, conf)

	    has_manual_instructions = builder.manual_download_instructions is not None
	    if has_manual_instructions or builder.info.size_in_bytes is None:
	        return datasets.load_dataset(path, conf)

	    builder.download_and_prepare()
	    return builder.as_dataset()


	def get_dataset_confs(path):
	    """Return the builder configs of dataset ``path`` when it declares more than one.

	    An empty list is returned when the builder class has no configs or
	    exactly one.
	    """
	    prepared = datasets.load.prepare_module(path, dataset=True)
	    # Resolve the builder class from the processing script. The configs are
	    # read from the class itself, so no builder instance is created.
	    builder_cls = datasets.load.import_main_class(prepared[0], dataset=True)
	    configs = builder_cls.BUILDER_CONFIGS
	    if not configs or len(configs) <= 1:
	        return []
	    return configs


	def render_features(features):
	    """Recursively turn a dataset schema (i.e. its fields) into plain values.

	    - dict: each value is rendered recursively under the same key
	    - ``ClassLabel``: its ``names``
	    - ``Value``: its ``dtype``
	    - ``Sequence``: ``{"[]": <rendered inner feature>}``
	    - anything else: returned unchanged
	    """
	    if isinstance(features, dict):
	        rendered = {name: render_features(spec) for name, spec in features.items()}
	    elif isinstance(features, datasets.features.ClassLabel):
	        rendered = features.names
	    elif isinstance(features, datasets.features.Value):
	        rendered = features.dtype
	    elif isinstance(features, datasets.features.Sequence):
	        rendered = {"[]": render_features(features.feature)}
	    else:
	        rendered = features
	    return rendered


	#
	# Loads dataset information
	#


	def filter_english_datasets(timeout=60):
	    """
	    Filter English datasets based on language tags in metadata.

	    Also includes the datasets of any users listed in INCLUDED_USERS,
	    whatever their language tags are.

	    Args:
	        timeout: Seconds to wait on the Hugging Face API before giving up;
	            forwarded to ``requests.get``.

	    Returns:
	        Sorted list of the ids of the selected datasets.

	    Raises:
	        requests.HTTPError: If the API answers with an error status.
	        requests.Timeout: If the API does not answer within ``timeout``.
	    """
	    # Without a timeout a stalled connection would block the caller forever.
	    response = requests.get("https://huggingface.co/api/datasets?full=true", timeout=timeout)
	    # Fail loudly on 4xx/5xx instead of iterating over an error payload.
	    response.raise_for_status()

	    english_datasets = []
	    for dataset in response.json():
	        dataset_name = dataset["id"]

	        # Community datasets are namespaced as "<user>/<name>"; allow-listed
	        # users are included without looking at the language metadata.
	        if "/" in dataset_name and dataset_name.split("/")[0] in INCLUDED_USERS:
	            english_datasets.append(dataset_name)
	            continue

	        # Skip entries whose card data is missing, null or not a mapping
	        # rather than crashing the whole scan on one malformed record.
	        metadata = dataset.get("card_data")
	        if not isinstance(metadata, dict):
	            continue

	        languages = metadata.get("languages")
	        if not languages:
	            continue
	        if isinstance(languages, str):
	            # In case a single language is stored as a plain string: wrap it so
	            # the checks below compare whole tags ("en" in "french" is True).
	            languages = [languages]

	        if "en" in languages or "en-US" in languages:
	            english_datasets.append(dataset_name)

	    return sorted(english_datasets)


	def list_datasets(template_collection, _state):
	    """Return all the datasets to work with, ordered case-insensitively.

	    Neither ``template_collection`` nor ``_state`` is read by this function;
	    the names come from ``filter_english_datasets``.
	    """
	    return sorted(filter_english_datasets(), key=str.lower)