annotate-relevance / scripts /collect_ir_dataset_names.py
Orion Weller
updates, charts, ir_datasets
68ecf38
raw
history blame
675 Bytes
import requests
from bs4 import BeautifulSoup
import re
import json
import os
import pathlib
import shutil
def get_ir_dataset_names(timeout=30.0):
    """Return the ir_datasets names that provide docs, queries, and qrels.

    Downloads the ``metadata.json`` file from the ir_datasets GitHub
    repository and keeps every top-level key whose entry contains all of
    the keys ``"docs"``, ``"queries"`` and ``"qrels"``.

    Args:
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block the script forever.

    Returns:
        A list of dataset names, in the order they appear in the
        downloaded metadata.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    url = "https://raw.githubusercontent.com/allenai/ir_datasets/master/ir_datasets/etc/metadata.json"
    required_keys = ("docs", "queries", "qrels")

    with requests.get(url, timeout=timeout) as response:
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        # NOTE(review): assumes the payload is a JSON object mapping each
        # dataset name to a mapping of its components; confirm upstream.
        metadata = json.loads(response.text)

    return [
        name
        for name, entry in metadata.items()
        if all(key in entry for key in required_keys)
    ]
if __name__ == "__main__":
    # Fetch the qualifying dataset names and store them next to the
    # script's working directory as a pretty-printed JSON list.
    dataset_names = get_ir_dataset_names()
    serialized = json.dumps(dataset_names, indent=4)
    pathlib.Path("ir_dataset_names.json").write_text(serialized)