Produce ColBERT-KO Evaluation Results

#1 by charlieCs - opened

Could you let me know how the average results of the evaluation were calculated?

Hi, sorry for the late response. This is the code for the native ColBERT evaluation.

from __future__ import annotations

import hashlib
import logging
import os
import traceback
from multiprocessing import Process, current_process

import torch
from setproctitle import setproctitle

import mteb
from mteb import MTEB, get_tasks
from mteb.models.colbert_models import ColBERTWrapper

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")


# GPU id -> retrieval tasks evaluated on that GPU (one worker process per GPU).
TASK_LIST_RETRIEVAL_GPU_MAPPING = {
    1: [
        "Ko-StrategyQA",
        "AutoRAGRetrieval",
        "PublicHealthQA",
        "BelebeleRetrieval",
        # "XPQARetrieval",
        "MultiLongDocRetrieval",
    ],
    2: ["MIRACLRetrieval"],
    3: ["MrTidyRetrieval"],
}

# Local checkpoint directories to evaluate; Hub model ids are prepended below.
model_names = [
    # my_model_directory
]
model_names = [
    "jinaai/jina-colbert-v2",
    "/data_x/yjoonjang/COLBERT/MODELS/sft-base=A.X-Encoder-base-data=data_mixed_241108-bs=256-ep=1-Q=32-D=1024-lr=1e-5-250924-fp32"
] + model_names

save_path = "./results"

def evaluate_model(model_name, gpu_id, tasks):
    try:
        # Pin this worker to a single GPU before any CUDA call; once
        # CUDA_VISIBLE_DEVICES is set, the only visible device is cuda:0.
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
        device = torch.device("cuda:0")
        torch.cuda.set_device(device)

        if "jinaai/jina-colbert-v2" in model_name:
            model = mteb.get_model(model_name, model_kwargs={"torch_dtype": torch.float32}, trust_remote_code=True, device=device)
        else:
            model = ColBERTWrapper(model_name, model_kwargs={"torch_dtype": torch.float32}, device=device)

        # Restrict tokenizer outputs to input_ids/attention_mask so the encoder
        # is not fed token_type_ids it does not expect.
        if hasattr(model.model[0].tokenizer, 'model_max_length'):
            model.model[0].tokenizer.model_input_names = ['input_ids', 'attention_mask']

        # Long local paths can exceed filesystem name limits: truncate the
        # folder name and append a short hash to keep it unique.
        output_folder_name = os.path.basename(model_name)
        if os.path.isdir(model_name) and len(output_folder_name) > 100:
            model_hash = hashlib.md5(model_name.encode()).hexdigest()[:6]
            output_folder_name = f"{output_folder_name[:93]}_{model_hash}"

        if os.path.isdir(model_name):
            try:
                model.model_meta.name = output_folder_name
            except AttributeError:
                logger.warning("Could not override model_meta.name. Path might still be too long.")
        
        setproctitle(f"{output_folder_name}-{gpu_id}")
        print(f"Running tasks: {tasks} / {model_name} on GPU {gpu_id} in process {current_process().name}")
        evaluation = MTEB(
            tasks=get_tasks(tasks=tasks, languages=["kor-Kore", "kor-Hang", "kor_Hang"])
        )
        if "jinaai/jina-colbert-v2" in model_name:
            batch_size = 512
        else:
            batch_size = 32

        evaluation.run(
            model,
            output_folder=f"{save_path}/{output_folder_name}",
            encode_kwargs={"batch_size": batch_size},
        )
    except Exception as ex:
        print(ex)
        traceback.print_exc()

if __name__ == "__main__":
    # 'spawn' gives each worker a fresh interpreter and its own CUDA context;
    # forked workers cannot safely reinitialize CUDA.
    torch.multiprocessing.set_start_method('spawn')
    
    for model_name in model_names:
        print(f"Starting evaluation for model: {model_name}")
        processes = []
        
        for gpu_id, tasks in TASK_LIST_RETRIEVAL_GPU_MAPPING.items():
            p = Process(target=evaluate_model, args=(model_name, gpu_id, tasks))
            p.start()
            processes.append(p)
        
        for p in processes:
            p.join()
        
        print(f"Completed evaluation for model: {model_name}")

For MIRACLRetrieval and MrTidyRetrieval, my server could not handle the document vectors, so I couldn't evaluate on those datasets.
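
If you want an average over these tasks, one way is to macro-average each task's main_score from the JSON files mteb writes under the output folder. A minimal sketch, assuming the default results/<model>/<revision>/<TaskName>.json layout (the helper name and example path are illustrative, not part of the script above):

import json
from pathlib import Path

def average_main_scores(results_dir):
    # Macro-average the main_score of every task JSON under results_dir.
    # Assumes each result file holds
    # {"scores": {"<split>": [{"main_score": ...}, ...]}}.
    task_scores = []
    for path in Path(results_dir).rglob("*.json"):
        if path.name == "model_meta.json":  # metadata file, not a task result
            continue
        data = json.loads(path.read_text())
        entries = [e for split in data.get("scores", {}).values() for e in split]
        scores = [e["main_score"] for e in entries if "main_score" in e]
        if scores:
            # average splits/subsets within a task before averaging across tasks
            task_scores.append(sum(scores) / len(scores))
    return sum(task_scores) / len(task_scores)

print(average_main_scores("./results/jina-colbert-v2"))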

For the MUVERA evaluation, you can check the GitHub script:
https://github.com/yjoonjang/muvera-py/blob/feat/pylate/main_pylate.py
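
(For context: MUVERA approximates multi-vector similarity with fixed-dimensional encodings, so standard single-vector retrieval infrastructure can be reused for ColBERT-style models.)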

  • Youngjoon Jang
yjoonjang changed discussion status to closed
