File size: 5,753 Bytes
14e4843 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
#!/usr/bin/env python
import json
import os
import time
from datetime import datetime, timezone
from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO
from src.submission.check_validity import already_submitted_models, get_model_size, is_model_on_hub
from huggingface_hub import snapshot_download
from src.backend.envs import EVAL_REQUESTS_PATH_BACKEND
from src.backend.manage_requests import get_eval_requests
from src.backend.manage_requests import EvalRequest
def add_new_eval(model: str, base_model: str, revision: str, precision: str, private: bool, weight_type: str, model_type: str) -> None:
    """Validate a model submission and upload an evaluation request to the queue repo.

    Progress and rejection reasons are reported with ``print``; nothing is
    returned. On success a JSON request file is written under
    ``EVAL_REQUESTS_PATH/<user>/``, uploaded to ``QUEUE_REPO`` and then removed
    locally.

    Args:
        model: Hub repo id, either ``"user/name"`` or a bare ``"name"``.
        base_model: Repo id of the base model; only checked on the Hub when
            ``weight_type`` is ``"Delta"`` or ``"Adapter"``.
        revision: Revision to evaluate; an empty string is treated as ``"main"``.
        precision: Precision label; only the text before the first space is kept.
        private: Stored in the request and embedded in the request file name.
        weight_type: ``"Delta"``/``"Adapter"`` trigger the base-model check;
            ``"Adapter"`` skips the Hub check of ``model`` itself.
        model_type: Must be a non-empty string, otherwise the request is rejected.
    """
    # Cheapest validation first: reject before scanning the request directory
    # or making any Hub call.
    if model_type is None or model_type == "":
        print("Please select a model type.")
        return

    # Only the set of already requested models is needed here.
    requested_models, _ = already_submitted_models(EVAL_REQUESTS_PATH)

    user_name = ""
    model_path = model
    if "/" in model:
        tokens = model.split("/")
        user_name = tokens[0]
        model_path = tokens[1]

    precision = precision.split(" ")[0]
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Does the model actually exist?
    if revision == "":
        revision = "main"

    # Is the model on the hub?
    if weight_type in ["Delta", "Adapter"]:
        base_model_on_hub, error, _ = is_model_on_hub(
            model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True
        )
        if not base_model_on_hub:
            print(f'Base model "{base_model}" {error}')
            return

    if weight_type != "Adapter":
        # Pass the same token as the base-model check so both lookups see the
        # same set of repositories.
        model_on_hub, error, _ = is_model_on_hub(
            model_name=model, revision=revision, token=H4_TOKEN, test_tokenizer=True
        )
        if not model_on_hub:
            print(f'Model "{model}" {error}')
            return

    # Is the model info correctly filled?
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
    except Exception:
        print("Could not get your model information. Please fill it up properly.")
        return

    model_size = get_model_size(model_info=model_info, precision=precision)

    # A missing license is reported but deliberately does not block the
    # submission; the request is stored with the placeholder 'none'.
    model_license = 'none'
    try:
        model_license = model_info.cardData["license"]
    except Exception:
        print("Please select a license for your model")

    # Seems good, creating the eval
    print("Adding new eval")

    eval_entry = {
        "model": model,
        "base_model": base_model,
        "revision": revision,
        "private": private,
        "precision": precision,
        "weight_type": weight_type,
        "status": "PENDING",
        "submitted_time": current_time,
        "model_type": model_type,
        "likes": model_info.likes,
        "params": model_size,
        "license": model_license,
    }

    # Check for duplicate submission
    if f"{model}_{revision}_{precision}" in requested_models:
        print("This model has been already submitted.")
        return

    print("Creating eval file")
    out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(out_dir, exist_ok=True)
    out_path = f"{out_dir}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(eval_entry, f)

    print("Uploading eval file")
    try:
        # NOTE(review): the repo-relative path is derived by splitting on
        # "eval-queue/", so this assumes EVAL_REQUESTS_PATH contains that
        # directory name - confirm against src.envs.
        API.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=out_path.split("eval-queue/")[1],
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model} to eval queue",
        )
    finally:
        # Remove the local file even when the upload fails, so no request file
        # that never reached the queue repo is left under EVAL_REQUESTS_PATH,
        # the directory the duplicate check above is fed from.
        os.remove(out_path)

    print("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
    return
def main():
    """Queue evaluation requests for up to 200 of the most-downloaded ``mistralai/`` models.

    Lists models from the Hub, keeps those whose id contains ``'mistralai/'``,
    sorts them by download count (highest first), downloads a snapshot of the
    queue repo, and calls ``add_new_eval`` for every candidate whose id is not
    already present among the existing requests.
    """
    from huggingface_hub import HfApi

    api = HfApi()

    def custom_filter(m) -> bool:
        # Alternative filters (disabled):
        #   m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False
        #   m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False and 'mistralai/' in m.id
        return 'mistralai/' in m.id

    # Filter while iterating so the complete Hub listing is never materialised.
    candidates = [m for m in api.list_models() if custom_filter(m)]
    # A missing download count is ranked as 0 instead of breaking the sort.
    filtered_model_lst = sorted(candidates, key=lambda m: m.downloads or 0, reverse=True)

    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)

    PENDING_STATUS = "PENDING"
    RUNNING_STATUS = "RUNNING"
    FINISHED_STATUS = "FINISHED"
    FAILED_STATUS = "FAILED"

    status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]

    # Get all eval requests
    eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)

    requested_model_names = {e.model for e in eval_requests}

    # Only the 200 most-downloaded candidates are considered.
    for model in filtered_model_lst[:200]:
        print(f'Considering {model.id} ..')

        # Tags may be absent; treat that as "no tags".
        is_finetuned = any(tag.startswith('base_model:') for tag in (model.tags or []))

        model_type = 'pretrained'
        if is_finetuned:
            model_type = "fine-tuned"

        # An id containing "nstruct" (Instruct/instruct) overrides the labels above.
        is_instruction_tuned = 'nstruct' in model.id
        if is_instruction_tuned:
            model_type = "instruction-tuned"

        if model.id in requested_model_names:
            print(f'Model {model.id} already added, not adding it to the queue again.')
            continue

        # NOTE(review): ids containing 'mage' are skipped silently - presumably
        # to exclude image models; confirm the intent.
        if 'mage' not in model.id:
            add_new_eval(model=model.id, base_model='', revision='main', precision='float32', private=False, weight_type='Original', model_type=model_type)
            # Pause between consecutive submissions.
            time.sleep(10)
# Run the queue-population routine only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|