Produce ColBERT-KO Evaluation Results
#1 opened by charlieCs
Could you let me know how the average results of the evaluation were calculated?
Hi, sorry for the late response. This is the code for the native ColBERT evaluation:
```python
from __future__ import annotations

import hashlib
import logging
import os
import traceback
from multiprocessing import Process, current_process

import torch
import mteb
from mteb import MTEB, get_tasks
from mteb.models.colbert_models import ColBERTWrapper
from setproctitle import setproctitle

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")

TASK_LIST_RETRIEVAL_GPU_MAPPING = {
    1: [
        "Ko-StrategyQA",
        "AutoRAGRetrieval",
        "PublicHealthQA",
        "BelebeleRetrieval",
        # "XPQARetrieval",
        "MultiLongDocRetrieval",
    ],
    2: ["MIRACLRetrieval"],
    3: ["MrTidyRetrieval"],
}

model_names = [
    # my_model_directory
]
model_names = [
    "jinaai/jina-colbert-v2",
    "/data_x/yjoonjang/COLBERT/MODELS/sft-base=A.X-Encoder-base-data=data_mixed_241108-bs=256-ep=1-Q=32-D=1024-lr=1e-5-250924-fp32",
] + model_names

save_path = "./results"


def evaluate_model(model_name, gpu_id, tasks):
    try:
        device = torch.device(f"cuda:{gpu_id}")
        torch.cuda.set_device(device)
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

        if "jinaai/jina-colbert-v2" in model_name:
            model = mteb.get_model(
                model_name,
                model_kwargs={"torch_dtype": torch.float32},
                trust_remote_code=True,
                device=device,
            )
        else:
            model = ColBERTWrapper(
                model_name, model_kwargs={"torch_dtype": torch.float32}, device=device
            )

        if hasattr(model.model[0].tokenizer, "model_max_length"):
            model.model[0].tokenizer.model_input_names = ["input_ids", "attention_mask"]

        output_folder_name = os.path.basename(model_name)
        if os.path.isdir(model_name) and len(output_folder_name) > 100:
            model_hash = hashlib.md5(model_name.encode()).hexdigest()[:6]
            output_folder_name = f"{output_folder_name[:93]}_{model_hash}"
        if os.path.isdir(model_name):
            try:
                model.model_meta.name = output_folder_name
            except AttributeError:
                logger.warning("Could not override model_meta.name. Path might still be too long.")

        setproctitle(f"{output_folder_name}-{gpu_id}")
        print(f"Running tasks: {tasks} / {model_name} on GPU {gpu_id} in process {current_process().name}")

        evaluation = MTEB(
            tasks=get_tasks(tasks=tasks, languages=["kor-Kore", "kor-Hang", "kor_Hang"])
        )

        batch_size = 512 if "jinaai/jina-colbert-v2" in model_name else 32

        evaluation.run(
            model,
            output_folder=f"{save_path}/{output_folder_name}",
            encode_kwargs={"batch_size": batch_size},
        )
    except Exception as ex:
        print(ex)
        traceback.print_exc()


if __name__ == "__main__":
    torch.multiprocessing.set_start_method("spawn")
    for model_name in model_names:
        print(f"Starting evaluation for model: {model_name}")
        processes = []
        for gpu_id, tasks in TASK_LIST_RETRIEVAL_GPU_MAPPING.items():
            p = Process(target=evaluate_model, args=(model_name, gpu_id, tasks))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        print(f"Completed evaluation for model: {model_name}")
```
For MIRACLRetrieval and MrTidyRetrieval, my server could not handle the vectors, so I was not able to evaluate on those datasets.
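To turn the per-task results into one average, the `<TaskName>.json` files that `evaluation.run` writes under `output_folder` can be macro-averaged on their `main_score` (NDCG@10 for retrieval tasks). Below is a minimal sketch, not the exact aggregation script, assuming the mteb 1.x result-file layout (`{"scores": {"<split>": [{"main_score": ...}, ...]}}`); the results path and the `average_results` helper are illustrative, and the split name depends on the task.

```python
# Sketch: macro-average per-task main_score values from mteb result JSONs.
# Assumes the mteb 1.x layout; the path below is only an example.
import json
from pathlib import Path


def average_results(results_dir: str, split: str = "test") -> float:
    per_task = {}
    for path in Path(results_dir).rglob("*.json"):
        if path.name == "model_meta.json":
            continue  # metadata file, not a task result
        data = json.loads(path.read_text())
        entries = data.get("scores", {}).get(split, [])
        if not entries:
            continue
        # Average over subsets/languages within a task, keeping one number per task.
        per_task[data.get("task_name", path.stem)] = sum(
            e["main_score"] for e in entries
        ) / len(entries)
    if not per_task:
        raise FileNotFoundError(f"No task results with split '{split}' under {results_dir}")
    for task, score in sorted(per_task.items()):
        print(f"{task}: {score:.4f}")
    return sum(per_task.values()) / len(per_task)


print("Average:", average_results("./results/jina-colbert-v2"))
```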
For the MUVERA evaluation, you can check the GitHub repo:
https://github.com/yjoonjang/muvera-py/blob/feat/pylate/main_pylate.py
- Youngjoon Jang
yjoonjang changed discussion status to closed