| import json | |
| import os | |
| from tqdm import tqdm | |
# Directory holding the per-split manifests (one JSON object per line) read below.
JOINT_JSON_DIRECTORY = f"/home/{os.environ['USER']}/data/wit/all_jsons"
# Directory whose entries decide which examples are kept; the filtered
# manifests are written into this same directory.
SCALE_CONVERTED_DIRECTORY = f"/home/{os.environ['USER']}/data/wit_scale_converted"


def _read_json_lines(path):
    """Return the JSON objects stored one per line in *path*.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of being handed to ``json.loads``.
    """
    # Explicit UTF-8: the records are written back with ensure_ascii=False,
    # so decoding must not depend on the locale's default encoding.
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _retarget_supported(examples, valid_files, target_directory):
    """Serialize the examples whose image file name is in *valid_files*.

    Each kept example has its ``"image_path"`` rewritten in place to
    ``target_directory/<file name>`` before being dumped to a JSON string.

    Args:
        examples: iterable of dicts, each with an ``"image_path"`` key.
        valid_files: container of file names that count as supported.
        target_directory: directory the rewritten paths point into.

    Returns:
        A list of JSON strings, one per kept example, in input order.
    """
    serialized = []
    for example in examples:
        filename = os.path.basename(example["image_path"])
        if filename not in valid_files:
            continue
        example["image_path"] = os.path.join(target_directory, filename)
        serialized.append(json.dumps(example, ensure_ascii=False))
    return serialized


# List the directory once instead of once per split. The manifests written
# below also land in this directory, but they are not in this snapshot and
# only matter if an example's image file name equals a manifest name.
valid_files = set(os.listdir(SCALE_CONVERTED_DIRECTORY))

for split in ["train", "valid", "test"]:
    print("Reading json")
    examples = _read_json_lines(
        f"{JOINT_JSON_DIRECTORY}/{split}_dataset_all_98_1_1_split.json"
    )
    supported_examples = _retarget_supported(
        tqdm(examples), valid_files, SCALE_CONVERTED_DIRECTORY
    )
    print(f"Total {split} examples: {len(supported_examples)}")
    # Explicit UTF-8 so non-ASCII text (kept verbatim by ensure_ascii=False)
    # is written correctly whatever the locale's default encoding is.
    with open(
        f"{SCALE_CONVERTED_DIRECTORY}/{split}_dataset_scale_converted_98_1_1_split.json",
        "w",
        encoding="utf-8",
    ) as f:
        f.write("\n".join(supported_examples))

print("DONE!")