File size: 5,010 Bytes
b7fbd2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c3bdec
 
 
 
 
 
2f34d06
53cf224
b7fbd2a
 
 
8c3bdec
b7fbd2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75a8362
 
 
b7fbd2a
 
 
 
 
 
 
 
 
 
 
 
8c3bdec
 
 
 
b7fbd2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c3bdec
b7fbd2a
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import argparse
import json
import pathlib
import os

# Command-line interface, parsed at import time.
# NOTE(review): --input_path/--output_path are currently unused — the paths in
# the __main__ block below are hard-coded (see the commented-out lines at the
# bottom of the file). Confirm whether the CLI flags should be wired back in.
parser = argparse.ArgumentParser(
    description="Format the output of the data card tool as .md for the hub."
)
parser.add_argument("--input_path", "-i", type=pathlib.Path, required=False)
parser.add_argument("--output_path", "-o", type=pathlib.Path, required=False)
args = parser.parse_args()


def read_json_file(json_path: pathlib.Path):
    """Read the file at *json_path* and return the parsed JSON object."""
    with open(json_path, "r") as handle:
        return json.load(handle)


def save_file(json_path: pathlib.Path, json_obj) -> None:
    """Serialize ``json_obj`` as pretty-printed JSON to ``json_path``.

    The original docstring claimed this takes a string and writes markdown;
    in fact every caller passes a JSON-serializable dict and the output is a
    ``.json`` file.

    Args:
        json_path: Destination path for the JSON file.
        json_obj: Any JSON-serializable object.
    """
    with open(json_path, "w") as f:
        # json.dump streams straight to the file instead of building the
        # whole string in memory first (json.dumps + write).
        json.dump(json_obj, f, indent=2)


def construct_json(dataset_name: str, data_card_data: dict, text_by_key: dict):
  """Construct the hub-formatted json object for one dataset.

  Iterates through ``text_by_key`` (the question configuration) and extracts
  all answers from the ``data_card_data`` object. The levels of the
  configuration hierarchy determine the heading levels (2/3/4) and the order
  in which anything appears is preserved.

  Args:
      dataset_name: Dataset name, used for the loader blurb and hub links.
      data_card_data: Output from the data card tool.
      text_by_key: Configuration defined in key_to_question.json.

  Returns:
      Tuple ``(new_json, total_words)``: the constructed card dict and the
      word count across all non-empty answers.
  """
  # Pull the optional header fields; missing keys fall back to defaults.
  overview = data_card_data.get("overview", {})
  where = overview.get("where", {})
  website_link = where.get("website", "")
  paper_link = where.get("paper-url", "")
  authors = overview.get("credit", {}).get("creators", "")
  summary = overview.get("what", {}).get("dataset", "Placeholder")

  # Add summary blurb with loading script and link to GEM loader.
  summary += (
      f"\n\nYou can load the dataset via:\n```\nimport datasets\n"
      f"data = datasets.load_dataset('GEM/{dataset_name}')\n```\n"
      f"The data loader can be found [here]"
      f"(https://huggingface.co/datasets/GEM/{dataset_name})."
  )

  new_json = {
      "name": dataset_name,
      "summary": summary,
      "sections": [],
  }

  # Optional top-level links, only added when present in the card.
  if website_link:
    new_json["website"] = website_link
  if paper_link:
    new_json["paper"] = paper_link
  if authors:
    new_json["authors"] = authors

  total_questions = 0
  total_words = 0

  for main_key, main_content in text_by_key.items():
    if main_key not in data_card_data:
      continue
    l2_data = {
        "title": main_content["section-title"],
        "level": 2,
        "subsections": [],
    }
    for second_key, second_content in main_content.items():
      if second_key == "section-title":
        continue
      # Skip summary data since it is already in the header.
      if main_key == "overview" and second_key == "what":
        continue
      l3_data = {
          "title": second_content["section-title"],
          "level": 3,
          "fields": [],
      }
      for final_key, final_content in second_content.items():
        if final_key == "section-title":
          continue
        # Counted even when the card lacks the subsection — this measures how
        # many questions the configuration asks for present sections.
        total_questions += 1
        try:
          answer = data_card_data[main_key][second_key].get(final_key, "N/A")
        except (KeyError, TypeError):
          # The card is missing this whole subsection (or it isn't a dict);
          # skip the question. (Was a bare `except:` — narrowed.)
          continue
        # Skip empty or explicit "n/a" answers.
        if isinstance(answer, str) and answer.lower() == "n/a":
          continue
        if not answer:
          continue
        # Lists are rendered as comma-separated inline-code items.
        if isinstance(answer, list):
          answer = ", ".join(f"`{a}`" for a in answer)
        json_answer = {
            "title": final_content["title"],
            "level": 4,
            "content": answer,
            "flags": final_content["flags"],
            "info": final_content["info"],
            "scope": final_content["scope"],
        }
        total_words += len(answer.split())
        l3_data["fields"].append(json_answer)
      l2_data["subsections"].append(l3_data)
    new_json["sections"].append(l2_data)
  print(f"Total questions {total_questions}")
  print(f"total words: {total_words}")
  return new_json, total_words




if __name__ == "__main__":

  text_by_key = read_json_file(
      os.path.join(os.path.dirname(__file__), "key_to_question.json")
  )
  total_words_across_everything = 0
  for dataset in os.listdir("../../../GEMv2"):
    data_card_path = f"../../../GEMv2/{dataset}/{dataset}.json"
    if os.path.exists(data_card_path):
      print(f"Now processing {dataset}.")
      new_path = f"datacards/{dataset}.json"
      data_card_data = read_json_file(data_card_path)
      data_card_json, total_cur_words = construct_json(dataset, data_card_data, text_by_key)
      total_words_across_everything += total_cur_words

      save_file(new_path, data_card_json)
    else:
      print(f"{dataset} has no data card!")
  print(total_words_across_everything)
  # data_card_json = construct_json(data_card_data, text_by_key)
  # save_file(args.output_path, data_card_json)