"""Filter a TVR-Ranking annotation file against an LMDB of per-query results.

For every record in the source annotation JSON, look up its ``query_id`` in
the consolidated results LMDB.  Records that have an entry are kept, and the
entry is copied under the same key into a new, smaller LMDB.  The kept records
are then written to a new annotation JSON and the before/after counts are
printed.
"""

import os

import h5py
import lmdb
import msgpack
import numpy as np
from tqdm import tqdm

from utils.basic_utils import load_json, save_json

# Source annotations (list of records, each carrying a "query_id").
data_path = "/home/renjie.liang/11_TVR-Ranking/ReLoCLNet/data/TVR_Ranking/train_top40.json"
old_data = load_json(data_path)

# Destination for the filtered annotations.
new_data_path = "./data/TVR_Ranking_CONQUER/train_top40.json"

# Destination LMDB directory for the copied entries.
# NOTE(review): the directory name says "top100" while the annotations are
# "top40" -- confirm this is intentional.
new_vr_path = "data/TVR_Ranking_train_top100_hero"

os.makedirs(new_vr_path, exist_ok=True)
# Create the JSON output directory up front so a missing directory fails
# before the copy loop runs rather than after it.
new_data_dir = os.path.dirname(new_data_path)
if new_data_dir:
    os.makedirs(new_data_dir, exist_ok=True)

consolidated_path = "/home/renjie.liang/datasets/tvr_feature_release/data/consolidated_vr_results"

clean_data = []

# Environments and transactions are context managers: the environments are
# closed and the transactions finished even if an exception interrupts the
# loop (the write transaction is aborted in that case, committed otherwise).
with lmdb.open(
    consolidated_path,
    readonly=True,
    create=False,
    max_readers=4096 * 8,
    readahead=False,
) as vr_pool, lmdb.open(
    new_vr_path,
    readonly=False,
    create=True,
    max_dbs=0,
    map_size=10 * 1024**3,  # 10 GiB upper bound for the new database
) as new_vr_pool:
    with vr_pool.begin(buffers=True) as vr_txn, new_vr_pool.begin(write=True) as new_vr_txn:
        for record in tqdm(old_data):
            key = str(record["query_id"]).encode()

            vr_data = vr_txn.get(key)
            if vr_data is None:
                # No entry for this query in the source LMDB: drop the record.
                continue

            clean_data.append(record)

            # Decode and re-encode the payload before storing it.  This copies
            # the data out of the read transaction's buffer and only stores
            # payloads that msgpack can actually decode.
            vr_res = msgpack.loads(vr_data)
            new_vr_txn.put(key, msgpack.dumps(vr_res))

save_json(clean_data, new_data_path)
print(len(old_data), "->", len(clean_data))