"""Convert reformatted GEM data-card JSON files into Markdown README files.

Reads the JSON produced by ``reformat_json.py`` together with the original
GEM data card, builds a Hugging Face-style YAML preamble plus the Markdown
body, and writes a ``README.md`` next to the original data card.
"""

import os
import pathlib
from json import load

# Placeholder value the GEM data cards use for fields that were never filled in.
MISSING = "[Needs More Information]"


def multi_grep(d, l1, l2, l3):
    """Safely fetch ``d[l1][l2][l3]``; return the MISSING placeholder if any level is absent."""
    return d.get(l1, {}).get(l2, {}).get(l3, MISSING)


def multi_grep2(d, l1, l2, l3):
    """Like :func:`multi_grep`, but default to ``["unknown"]`` (for list-valued fields)."""
    return d.get(l1, {}).get(l2, {}).get(l3, ["unknown"])


def sanitize_md_url(s):
    """Strip out MD fragments if they exist.

    ``"[label](http://x)"`` becomes ``"http://x"``; plain strings pass through.
    """
    if len(s.split("](")) > 1:
        return s.split("](")[1].replace(")", "")
    else:
        return s


def construct_preamble(data, name):
    """Build the YAML metadata preamble for a dataset README.

    ``data`` is the original GEM data-card dict; ``name`` is the dataset name.
    Example of the expected output shape:

    ---
    annotations_creators:
    - expert-generated
    language_creators:
    - found
    languages:
    - en
    licenses:
    - unknown
    multilinguality:
    - monolingual
    pretty_name: FairytaleQA
    size_categories:
    - 10K<n<100K
    source_datasets:
    - original
    task_categories:
    - question-generation
    task_ids:
    - abstractive-qg
    ---
    """
    pre = "---\n"
    pre += "annotations_creators:\n"
    s = multi_grep(data, "curation", "annotations", "origin")
    if s == MISSING:
        pre += "- unknown\n"
    else:
        # YAML tags are hyphen-separated, e.g. "expert generated" -> "expert-generated".
        pre += "- " + s.replace(" ", "-") + "\n"
    pre += "language_creators:\n- unknown\n"
    pre += "languages:"
    languages = multi_grep2(data, "overview", "languages", "language_names")
    for lang in languages:
        pre += f"\n- {lang}"
    pre += "\nlicenses:\n"
    s = multi_grep(data, "overview", "languages", "license")
    if s == MISSING:
        pre += "- unknown\n"
    else:
        # License entries look like "id: description"; keep only the id part.
        pre += "- " + s.split(":")[0] + "\n"
    pre += "multilinguality:\n"
    if languages == ["unknown"]:
        pre += "- unknown"
    elif len(languages) == 1:
        pre += "- monolingual"
    else:
        pre += "- multilingual"
    pre += f"\npretty_name: {name}\n"
    pre += "size_categories:\n- unknown\n"
    pre += "source_datasets:\n- original\n"
    pre += "task_categories:\n"
    s = multi_grep(data, "overview", "languages", "task")
    if s == MISSING:
        pre += "- unknown\n"
    else:
        # e.g. "Question Generation" -> "question-generation"
        pre += "- " + "-".join(s.lower().split(" ")) + "\n"
    pre += "task_ids:\n- unknown\n"
    pre += "---\n\n"
    return pre


# Intended Table of Contents (see construct_toc, currently unimplemented):
# - [Dataset Description](#dataset-description)
# - [Dataset Summary](#dataset-summary)
# - [Supported Tasks](#supported-tasks-and-leaderboards)
# - [Languages](#languages)
# - [Dataset Structure](#dataset-structure)
# - [Data Instances](#data-instances)
# - [Data Fields](#data-instances)
# - [Data Splits](#data-instances)
# - [Dataset Creation](#dataset-creation)
# - [Curation Rationale](#curation-rationale)
# - [Source Data](#source-data)
# - [Annotations](#annotations)
# - [Personal and Sensitive Information](#personal-and-sensitive-information)
# - [Considerations for Using the Data](#considerations-for-using-the-data)
# - [Social Impact of Dataset](#social-impact-of-dataset)
# - [Discussion of Biases](#discussion-of-biases)
# - [Other Known Limitations](#other-known-limitations)
# - [Additional Information](#additional-information)
# - [Dataset Curators](#dataset-curators)
# - [Licensing Information](#licensing-information)
# - [Citation Information](#citation-information)
def construct_toc(data):
    """Build a Markdown table of contents. TODO: not implemented (see comment above)."""
    pass


def construct_links(data):
    """Render the "Dataset Description" link list from the original data card."""
    links = "## Dataset Description\n\n"
    s = sanitize_md_url(multi_grep(data, "overview", "where", "website"))
    links += f"- **Homepage:** {s}\n"
    s = sanitize_md_url(multi_grep(data, "overview", "where", "data-url"))
    links += f"- **Repository:** {s}\n"
    s = sanitize_md_url(multi_grep(data, "overview", "where", "paper-url"))
    links += f"- **Paper:** {s}\n"
    s = sanitize_md_url(multi_grep(data, "overview", "where", "leaderboard-url"))
    links += f"- **Leaderboard:** {s}\n"
    s = multi_grep(data, "overview", "where", "contact-name")
    links += f"- **Point of Contact:** {s}\n\n"
    return links


def json_to_markdown(filename, original_json_path):
    """Render one dataset card to Markdown and write it out as README.md.

    Args:
        filename: path to the reformatted card JSON (output of reformat_json.py).
        original_json_path: path to the original GEM data-card JSON; the
            README.md is written into this file's directory.
    """
    with open(filename) as f:
        card = load(f)
    with open(original_json_path) as f:
        original_json = load(f)
    dataset_name = pathlib.Path(original_json_path).stem
    markdown = construct_preamble(original_json, dataset_name)
    markdown += f'# Dataset Card for GEM/{card["name"]}\n\n'
    # ToC here.
    markdown += construct_links(original_json)
    markdown += "### Link to Main Data Card\n\n"
    markdown += f'You can find the main data card on the [GEM Website](https://gem-benchmark.com/data_cards/{dataset_name}).\n\n'
    markdown += "### Dataset Summary \n\n"
    markdown += card['summary'] + '\n\n'
    # Any remaining top-level keys become their own "#### key" sections.
    for key in card:
        if key not in ('name', 'summary', 'sections'):
            markdown += f'#### {key}\n{card[key]}\n\n'
    markdown += '\n'.join(section_to_markdown(section)
                          for section in card['sections'])
    readme_path = pathlib.Path(original_json_path).parent / "README.md"
    with open(readme_path, 'w') as f:
        f.write(markdown)


def section_to_markdown(section):
    """Render one card section (heading plus all its subsections) to Markdown."""
    markdown = f'{"#" * section["level"]} {section["title"]}\n\n'
    markdown += '\n'.join(subsection_to_markdown(subsection)
                          for subsection in section['subsections'])
    return markdown + '\n'


def subsection_to_markdown(subsection):
    """Render one subsection (heading plus all its fields) to Markdown."""
    markdown = f'{"#" * subsection["level"]} {subsection["title"]}\n\n'
    markdown += '\n'.join(field_to_markdown(field)
                          for field in subsection['fields'])
    return markdown + '\n'


def field_to_markdown(field):
    """Render one field: heading, optional HTML-comment metadata, then content."""
    markdown = f'{"#" * field["level"]} {field["title"]}\n\n'
    # The HTML comments carry Data Cards Labs metadata without showing in the page.
    if 'flags' in field and 'quick' in field['flags']:
        markdown += '<!-- quick -->\n'
    if field.get('info', False):
        markdown += f'<!-- info: {field["info"]} -->\n'
    if field.get('scope', False):
        markdown += f'<!-- scope: {field["scope"]} -->\n'
    markdown += field.get('content', '')
    return markdown + '\n'


if __name__ == "__main__":
    for dataset in os.listdir("../../../GEMv2"):
        data_card_path = f"../../../GEMv2/{dataset}/{dataset}.json"
        if os.path.exists(data_card_path):
            print(f"Now processing {dataset}.")
            # This script assumes you have run reformat_json.py
            new_path = f"datacards/{dataset}.json"
            json_to_markdown(new_path, data_card_path)
        else:
            print(f"{dataset} has no data card!")