# NOTE(review): the lines that were here ("Spaces:", "Build error", file-size,
# commit hash, and a line-number gutter) were web-scrape/extraction artifacts,
# not part of the source file; they have been removed so the module parses.
import csv
from typing import List, Optional, Tuple
import pkg_resources
# from rich import inspect
from rich.pretty import pprint
from promptsource.templates import TemplateCollection
def preview() -> None:
    """Print summary statistics for the D4 experiment spreadsheet.

    Reads ``experiment_D4.csv`` (packaged next to this module), partitions its
    rows into train / eval / D3 subsets, then cross-references the evaluation
    datasets against the promptsource ``TemplateCollection``: counts original
    vs. non-original-task templates per dataset, and reports templates that
    are missing metrics or the ``original_task`` flag.
    """
    experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv")
    gsheet = {}  # maps (HF_name, subset) -> full csv row for later lookup
    d4_train: List[Tuple[str, Optional[str]]] = []
    d4_eval: List[Tuple[str, Optional[str]]] = []
    d3_train_gpt: List[Tuple[str, Optional[str]]] = []
    d3_train_sglue: List[Tuple[str, Optional[str]]] = []
    # newline="" is the csv module's documented requirement for input files.
    with open(experiment_path, newline="") as exp_file:
        reader = csv.DictReader(exp_file)
        for row in reader:
            if row["skip"]:
                continue
            if row["subset"] == "":
                row["subset"] = None  # to match promptsource.Template object
            dataset_subset = (row["HF_name"], row["subset"])
            # Membership flags are spreadsheet-exported strings, hence "TRUE".
            if row["do_train"] == "TRUE":
                d4_train.append(dataset_subset)
            if row["do_eval"] == "TRUE":
                d4_eval.append(dataset_subset)
            if row["D3_do_train"] == "TRUE" and "GPT" in row["seed_paper"]:
                d3_train_gpt.append(dataset_subset)
            if row["D3_do_train"] == "TRUE" and row["HF_name"] == "super_glue":
                d3_train_sglue.append(dataset_subset)
            gsheet[dataset_subset] = row
    all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue
    print(f"Number of non-desk-rejected datasets = {len(all_datasets)}")
    print(f"Number of training sets = {len(d4_train)}")
    print(f"Number of evaluation sets = {len(d4_eval)}")

    template_collection = TemplateCollection()
    output = []
    missing_og_flags = []
    missing_metrics = []
    # Snapshot the keys: we remove entries from the collection inside the loop,
    # so we must not iterate the live key view while mutating it.
    for dataset_name, subset_name in list(template_collection.keys):
        ds_name = (dataset_name, subset_name)
        if ds_name not in d4_eval:
            # Keep only evaluation datasets in the collection.
            template_collection.remove(dataset_name, subset_name)
            continue
        OG = 0  # templates flagged as the dataset's original task
        non_OG = 0  # templates explicitly flagged as not the original task
        dataset = template_collection.get_dataset(dataset_name, subset_name)
        for template_name in dataset.all_template_names:
            template = dataset[template_name]
            if not template.metadata.metrics:
                missing_metrics.append(f"{dataset_name}/{subset_name}/{template_name}")
            # original_task is a tri-state: True / False / None (unset).
            if template.metadata.original_task is True:
                OG += 1
            elif template.metadata.original_task is False:
                non_OG += 1
            elif template.metadata.original_task is None:
                missing_og_flags.append(dataset_name + "/" + template_name)
                continue
        # Spreadsheet cells may be empty; treat a blank train_size as 0.
        train_size = gsheet[ds_name]["train_size"]
        train_size = int(train_size) if train_size != "" else 0
        # Per-template share of the training examples for this dataset.
        adjusted_train_size = train_size // len(dataset.all_template_names)
        output.append(
            (
                f"{dataset_name} {subset_name if subset_name else ''}",
                f"{OG}-{non_OG}",
                f"{train_size:,} {adjusted_train_size:,}",
            )
        )
    pprint(output)
    print(len(template_collection))
    print("Missing metrics:")
    pprint(missing_metrics)
    print("Missing original task flags:")
    pprint(missing_og_flags)
# Script entry point: print the experiment preview when run directly.
if __name__ == "__main__":
    preview()