"""Collect the ir_datasets dataset ids that ship docs, queries and qrels.

Run as a script to write the resulting list to ``ir_dataset_names.json``
in the current working directory.
"""

import json
import os
import pathlib
import re
import shutil

import requests
from bs4 import BeautifulSoup

# Location of the ir_datasets metadata file on the project's master branch.
METADATA_URL = "https://raw.githubusercontent.com/allenai/ir_datasets/master/ir_datasets/etc/metadata.json"

# A dataset is kept only if its metadata entry contains every one of these keys.
REQUIRED_KEYS = ("docs", "queries", "qrels")


def get_ir_dataset_names(url: str = METADATA_URL, *, timeout: float = 30.0) -> list:
    """Return the dataset ids whose metadata has docs, queries and qrels.

    Args:
        url: Address of the metadata JSON to download. Defaults to the
            ir_datasets ``metadata.json`` on GitHub.
        timeout: Seconds to wait for the server before giving up; without
            it ``requests`` would block indefinitely on a stalled connection.

    Returns:
        A list of dataset ids, in the order they appear in the JSON document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    with requests.get(url, timeout=timeout) as response:
        # Fail loudly on an HTTP error instead of trying to parse an error
        # page as JSON.
        response.raise_for_status()
        metadata = json.loads(response.text)

    return [
        dataset
        for dataset, entry in metadata.items()
        if all(key in entry for key in REQUIRED_KEYS)
    ]


if __name__ == "__main__":
    names = get_ir_dataset_names()
    with open("ir_dataset_names.json", "w", encoding="utf-8") as fout:
        json.dump(names, fout, indent=4)