#!/usr/bin/env python
"""Debug CLI: run the ``halueval_qa`` task on one queued evaluation request.

The script mirrors the request queue dataset locally, picks the first request
whose model name contains ``bloom-560m``, evaluates it on a handful of
examples through ``lm_eval`` and prints the raw results.
"""

from huggingface_hub import snapshot_download

from src.backend.envs import EVAL_REQUESTS_PATH_BACKEND
from src.backend.manage_requests import get_eval_requests
from src.backend.manage_requests import EvalRequest
from src.backend.run_eval_suite import run_evaluation

from lm_eval.tasks import initialize_tasks, include_task_folder
from lm_eval import tasks, evaluator, utils

from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
from src.envs import QUEUE_REPO


def main():
    """Evaluate a single queued model on ``halueval_qa`` and print the results.

    Steps:
      1. Download the ``main`` revision of the ``QUEUE_REPO`` dataset into
         ``EVAL_REQUESTS_PATH_BACKEND``.
      2. Load the eval requests for every job status and keep the first one
         whose ``model`` contains ``bloom-560m``.
      3. Register the tasks found under ``src/backend/tasks/`` and resolve
         the requested task names against ``tasks.ALL_TASKS``.
      4. Run ``evaluator.simple_evaluate`` (zero-shot, batch size 4, limited
         to 8 examples) and print what it returns.

    Raises:
        IndexError: if no queued request matches the model name filter.
    """
    snapshot_download(
        repo_id=QUEUE_REPO,
        revision="main",
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        repo_type="dataset",
        max_workers=60,
    )

    PENDING_STATUS = "PENDING"
    RUNNING_STATUS = "RUNNING"
    FINISHED_STATUS = "FINISHED"
    FAILED_STATUS = "FAILED"

    status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]

    # Fetch eval requests in every status (pending, running, finished and
    # failed); narrow this list to restrict which requests are considered.
    eval_requests: list[EvalRequest] = get_eval_requests(
        job_status=status,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    # Take the first matching request. The exception type is the same one the
    # previous `[...][0]` lookup raised on an empty match, but now the message
    # says what was being looked for.
    model_substring = 'bloom-560m'
    eval_request = next((r for r in eval_requests if model_substring in r.model), None)
    if eval_request is None:
        raise IndexError(
            f"No eval request in {QUEUE_REPO} has a model name containing {model_substring!r}"
        )

    task_names = ['halueval_qa']

    # Custom task definitions must be registered before the task index is
    # initialized, otherwise they are missing from tasks.ALL_TASKS.
    include_task_folder("src/backend/tasks/")
    initialize_tasks('INFO')

    print(tasks.ALL_TASKS)

    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)

    print(f"Selected Tasks: {task_names}")

    results = evaluator.simple_evaluate(
        model="hf-auto",
        model_args=eval_request.get_model_args(),
        tasks=task_names,
        num_fewshot=0,
        batch_size=4,
        device=DEVICE,
        use_cache=None,
        limit=8,
        write_out=True,
    )

    print('AAA', results)


if __name__ == "__main__":
    main()