# CONQUER_RVMR/unused/select_conquer_dataset.py
# Liangrj5
# init
# a638e43
import h5py
import lmdb
import numpy as np
import msgpack
from utils.basic_utils import load_json, save_json
from tqdm import tqdm
import os
# --- Split configuration -----------------------------------------------------
# Exactly one line is active in each of the three groups below; to process a
# different split, switch the active line in all three groups together.

# Input annotation file; each entry is indexed by its "query_id" further down.
data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/train_top40.json"
# data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/val.json"
# data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/test.json"
old_data = load_json(data_path)

# Output annotation file: the subset of `old_data` that has a record in the
# source LMDB.
new_data_path = "./data/TVR_Ranking_CONQUER/train_top40.json"
# new_data_path = "./data/TVR_Ranking_CONQUER/val.json"
# new_data_path = "./data/TVR_Ranking_CONQUER/test.json"

# Destination LMDB path (for writing)
new_vr_path = "data/TVR_Ranking_train_top100_hero"
# new_vr_path = "data/TVR_Ranking_val_top100_hero"
# new_vr_path = "data/TVR_Ranking_test_top100_hero"

os.makedirs(new_vr_path, exist_ok=True)
# The JSON is written only after the whole copy has finished. Create its
# parent directory now (in case save_json does not) so a missing directory
# cannot make the script fail at the very last step.
os.makedirs(os.path.dirname(new_data_path) or ".", exist_ok=True)
# Source LMDB (opened read-only) from which per-query records are copied.
consolidated_path = "/home/renjie.liang/datasets/tvr_feature_release/data/consolidated_vr_results"

# Entries of `old_data` whose query_id has a record in the source LMDB.
clean_data = []

# Context managers close both environments and end both transactions even if
# the copy fails part-way. The write transaction is aborted on an exception,
# so a failed run commits nothing to the destination database.
with lmdb.open(consolidated_path, readonly=True, create=False,
               max_readers=4096 * 8, readahead=False) as vr_pool:
    # Open the new LMDB for writing
    with lmdb.open(new_vr_path, readonly=False, create=True, max_dbs=0,
                   map_size=10 * 1024**3) as new_vr_pool:  # 10 GiB
        with vr_pool.begin(buffers=True) as vr_txn, \
                new_vr_pool.begin(write=True) as new_vr_txn:
            for sample in tqdm(old_data):
                # Keys are the UTF-8 encoded string form of the query id.
                key = str(sample["query_id"]).encode()
                # Retrieve the data from the source database
                vr_data = vr_txn.get(key)
                if vr_data is None:
                    # No record for this query: drop it from the output.
                    continue
                clean_data.append(sample)
                # Decode and re-encode with msgpack before storing, so a
                # payload that cannot be decoded raises here instead of
                # being copied into the new database.
                vr_res = msgpack.loads(vr_data)
                new_vr_txn.put(key, msgpack.dumps(vr_res))

# Both databases are closed at this point; write the filtered annotations
# and report how many entries survived.
save_json(clean_data, new_data_path)
print(len(old_data), "->", len(clean_data))