"""Filter TVR-Ranking annotations to queries that have cached VR results.

For every annotation in ``data_path`` the script looks up its
``query_id`` in a read-only LMDB of precomputed video-retrieval (VR)
results.  Hits are (a) kept in ``clean_data`` and (b) copied
byte-for-byte into a fresh LMDB at ``new_vr_path``; misses are dropped.
The surviving annotations are finally written to ``new_data_path``.
"""
import os

import h5py  # noqa: F401  -- imported in the original script; unused below
import lmdb
import msgpack  # noqa: F401  -- no longer needed: values are copied verbatim
import numpy as np  # noqa: F401  -- imported in the original script; unused below
from tqdm import tqdm

from utils.basic_utils import load_json, save_json

# Source annotation file; uncomment exactly one line per split.
data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/train_top40.json"
# data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/val.json"
# data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/test.json"
old_data = load_json(data_path)

# Filtered annotation output (keep in sync with data_path's split).
new_data_path = "./data/TVR_Ranking_CONQUER/train_top40.json"
# new_data_path = "./data/TVR_Ranking_CONQUER/val.json"
# new_data_path = "./data/TVR_Ranking_CONQUER/test.json"

# Destination LMDB path (for writing).
# NOTE(review): data_path says "top40" but this directory says "top100" --
# confirm the naming mismatch is intentional before running.
new_vr_path = "data/TVR_Ranking_train_top100_hero"
# new_vr_path = "data/TVR_Ranking_val_top100_hero"
# new_vr_path = "data/TVR_Ranking_test_top100_hero"
os.makedirs(new_vr_path, exist_ok=True)
# Also make sure the JSON output directory exists (the original script
# assumed it already did and would crash in save_json otherwise).
os.makedirs(os.path.dirname(new_data_path), exist_ok=True)

# Read-only source LMDB of consolidated VR results.
consolidated_path = "/home/renjie.liang/datasets/tvr_feature_release/data/consolidated_vr_results"
vr_pool = lmdb.open(consolidated_path, readonly=True, create=False,
                    max_readers=4096 * 8, readahead=False)
# Destination LMDB for the subset we keep.
new_vr_pool = lmdb.open(new_vr_path, readonly=False, create=True,
                        max_dbs=0, map_size=10 * 1024 ** 3)  # 10 GiB

clean_data = []
try:
    # Context-managed transactions: the write txn commits on clean exit and
    # aborts if the loop raises, instead of leaking a dangling transaction.
    with vr_pool.begin(buffers=True) as vr_txn, \
            new_vr_pool.begin(write=True) as new_vr_txn:
        for item in tqdm(old_data):
            key = str(item["query_id"]).encode()
            vr_data = vr_txn.get(key)
            if vr_data is None:
                # No cached VR result for this query: drop the annotation.
                continue
            clean_data.append(item)
            # Copy the stored value verbatim.  The original decoded with
            # msgpack.loads and immediately re-encoded with msgpack.dumps,
            # which is wasted work and a potential source of byte-level
            # drift; bytes() also detaches the value from the short-lived
            # buffers=True memoryview before it is reused.
            new_vr_txn.put(key, bytes(vr_data))
finally:
    # Close both environments even if the copy loop fails partway.
    vr_pool.close()
    new_vr_pool.close()

save_json(clean_data, new_data_path)
print(len(old_data), "->", len(clean_data))