Spaces:

TIGER-Lab
/

GenAI-Arena

Running on Zero

App Files Files Community

GenAI-Arena / arena_elo /elo_rating /clean_battle_data.py

DongfuJiang

update model registry

0b4b1e4 4 months ago

raw

history blame

12 kB

	"""
	Clean chatbot arena battle log.

	Usage:
	python3 clean_battle_data.py --mode conv_release
	"""
	import argparse
	import datetime
	import json
	import os
	import sys
	from pytz import timezone
	import time
	import PIL
	from PIL import ImageFile
	ImageFile.LOAD_TRUNCATED_IMAGES = True

	from tqdm import tqdm

	from .basic_stats import get_log_files, NUM_SERVERS, LOG_ROOT_DIR
	from .utils import detect_language, get_time_stamp_from_date, get_model_info

	VOTES = ["tievote", "leftvote", "rightvote", "bothbad_vote"]

	def parse_model_name(model_name):
	return NotImplementedError()
	return model_source, model_name, model_type

	def remove_html(raw):
	if raw.startswith("<h3>"):
	return raw[raw.find(": ") + 2 : -len("</h3>\n")]
	if raw.startswith("### Model A: ") or raw.startswith("### Model B: "):
	return raw[13:]
	return raw


	def to_openai_format(messages):
	roles = ["user", "assistant"]
	ret = []
	for i, x in enumerate(messages):
	ret.append({"role": roles[i % 2], "content": x[1]})
	return ret


	def replace_model_name(old_name, tstamp):
	replace_dict = {
	"PlayGroundV2": "PlayGround V2",
	"PlayGroundV2.5": "PlayGround V2.5",
	"FluxTimestep": "FLUX1schnell",
	"FluxGuidance": "FLUX1dev"
	}
	if old_name in replace_dict:
	old_name = replace_dict[old_name]
	if "Flux" in old_name:
	print(f"Invalid model names: {old_name}")
	exit(1)
	model_info = get_model_info(old_name)
	old_name = model_info.simple_name
	return old_name


	def read_file(filename):
	data = []
	for retry in range(5):
	try:
	# lines = open(filename).readlines()
	for l in open(filename):
	row = json.loads(l)
	if row["type"] in VOTES:
	data.append(row)
	break
	except FileNotFoundError:
	time.sleep(2)
	except json.JSONDecodeError:
	print(f"Error in reading {filename}")
	print(row)
	exit(0)
	return data


	def read_file_parallel(log_files, num_threads=16):
	data_all = []
	if num_threads == 1:
	for log_file in tqdm(log_files, desc="Reading"):
	data_all.extend(read_file(log_file))
	return data_all
	else:
	from multiprocessing import Pool

	with Pool(num_threads) as p:
	ret_all = list(tqdm(p.imap(read_file, log_files), total=len(log_files)))
	for ret in ret_all:
	data_all.extend(ret)
	return data_all

	def load_image(image_path):
	try:
	return PIL.Image.open(image_path)
	except:
	return None

	def clean_battle_data(
	log_files, exclude_model_names, ban_ip_list=None, sanitize_ip=False, mode="simple", task_name="image_editing"
	):
	data = read_file_parallel(log_files, num_threads=1)

	convert_type = {
	"leftvote": "model_a",
	"rightvote": "model_b",
	"tievote": "tie",
	"bothbad_vote": "tie (bothbad)",
	}

	all_models = set()
	all_ips = dict()
	ct_anony = 0
	ct_invalid = 0
	ct_leaked_identity = 0
	ct_banned = 0
	battles = []
	for row in tqdm(data, desc="Cleaning"):
	if row["models"][0] is None or row["models"][1] is None:
	print(f"Invalid model names: {row['models']}")
	continue

	# Resolve model names
	models_public = [remove_html(row["models"][0]), remove_html(row["models"][1])]
	if "model_name" in row["states"][0]:
	models_hidden = [
	row["states"][0]["model_name"],
	row["states"][1]["model_name"],
	]
	if models_hidden[0] is None:
	models_hidden = models_public
	else:
	models_hidden = models_public

	if (models_public[0] == "" and models_public[1] != "") or (
	models_public[1] == "" and models_public[0] != ""
	):
	ct_invalid += 1
	print(f"Invalid model names: {models_public}")
	continue

	if models_public[0] == "" or models_public[0] == "Model A":
	anony = True
	models = models_hidden
	ct_anony += 1
	else:
	anony = False
	models = models_public
	if not models_public == models_hidden:
	print(f"Model names mismatch: {models_public} vs {models_hidden}")
	ct_invalid += 1
	continue

	def preprocess_model_name(m):
	if m == "Playground v2":
	return 'playground_PlayGroundV2_generation'
	if m == "Playground v2.5":
	return 'playground_PlayGroundV2.5_generation'
	return m
	models = [preprocess_model_name(m) for m in models]

	# Replace bard with palm
	if task_name == "image_editing":
	valid = True
	for _model in models:
	try:
	platform, model_name, task = _model.split("_")
	except ValueError:
	valid = False
	break
	if not (platform in ["playground", "imagenhub"] and task == "edition"):
	valid = False
	break
	if not valid:
	ct_invalid += 1
	continue
	for i, _model in enumerate(models):
	platform, model_name, task = _model.split("_")
	models[i] = model_name

	elif task_name == "t2i_generation":
	valid = True
	for _model in models:
	try:
	platform, model_name, task = _model.split("_")
	except ValueError:
	valid = False
	break
	if not (platform.lower() in ["playground", "imagenhub", 'fal'] and (task == "generation" or task == "text2image")):
	valid = False
	break
	if not valid:
	ct_invalid += 1
	continue
	for i, _model in enumerate(models):
	platform, model_name, task = _model.split("_")
	models[i] = model_name

	elif task_name == "video_generation":
	valid = True
	for _model in models:
	try:
	platform, model_name, task = _model.split("_")
	except ValueError:
	valid = False
	break
	if not (platform in ["videogenhub", "fal"] and task == "generation" or task == "text2video"):
	valid = False
	break
	if not valid:
	ct_invalid += 1
	continue
	for i, _model in enumerate(models):
	platform, model_name, task = _model.split("_")
	models[i] = model_name

	else:
	raise ValueError(f"Invalid task_name: {task_name}")

	models = [replace_model_name(m, row["tstamp"]) for m in models]

	# Exclude certain models
	if exclude_model_names and any(x in exclude_model_names for x in models):
	ct_invalid += 1
	continue

	if mode == "conv_release":
	# assert the two images are the same
	date = datetime.datetime.fromtimestamp(row["tstamp"], tz=timezone("US/Pacific")).strftime("%Y-%m-%d") # 2024-02-29
	image_path_format = f"{LOG_ROOT_DIR}/{date}-convinput_images/input_image_"
	image_path_0 = image_path_format + str(row["states"][0]["conv_id"]) + ".png"
	image_path_1 = image_path_format + str(row["states"][1]["conv_id"]) + ".png"
	if not os.path.exists(image_path_0) or not os.path.exists(image_path_1):
	print(f"Image not found for {image_path_0} or {image_path_1}")
	ct_invalid += 1
	continue

	image_0 = load_image(image_path_0)
	image_1 = load_image(image_path_1)
	if image_0 is None or image_1 is None:
	print(f"Image not found for {image_path_0} or {image_path_1}")
	ct_invalid += 1
	continue
	if image_0.tobytes() != image_1.tobytes():
	print(f"Image not the same for {image_path_0} and {image_path_1}")
	ct_invalid += 1
	continue


	question_id = row["states"][0]["conv_id"]

	ip = row["ip"]
	if ip not in all_ips:
	all_ips[ip] = {"ip": ip, "count": 0, "sanitized_id": len(all_ips)}
	all_ips[ip]["count"] += 1
	if sanitize_ip:
	user_id = f"arena_user_{all_ips[ip]['sanitized_id']}"
	else:
	user_id = f"{all_ips[ip]['ip']}"

	if ban_ip_list is not None and ip in ban_ip_list:
	ct_banned += 1
	print(f"User {user_id} is banned")
	continue

	# Save the results
	battles.append(
	dict(
	question_id=question_id,
	model_a=models[0],
	model_b=models[1],
	winner=convert_type[row["type"]],
	judge=f"arena_user_{user_id}",
	anony=anony,
	tstamp=row["tstamp"],
	)
	)

	all_models.update(models_hidden)
	battles.sort(key=lambda x: x["tstamp"])
	last_updated_tstamp = battles[-1]["tstamp"]

	last_updated_datetime = datetime.datetime.fromtimestamp(
	last_updated_tstamp, tz=timezone("US/Pacific")
	).strftime("%Y-%m-%d %H:%M:%S %Z")

	print(
	f"#votes: {len(data)}, #invalid votes: {ct_invalid}, "
	f"#leaked_identity: {ct_leaked_identity} "
	f"#banned: {ct_banned} "
	)
	print(f"#battles: {len(battles)}, #anony: {ct_anony}")
	print(f"#models: {len(all_models)}, {all_models}")
	print(f"last-updated: {last_updated_datetime}")

	if ban_ip_list is not None:
	for ban_ip in ban_ip_list:
	if ban_ip in all_ips:
	del all_ips[ban_ip]
	print("Top 30 IPs:")
	print(sorted(all_ips.values(), key=lambda x: x["count"], reverse=True)[:30])
	return battles


	if __name__ == "__main__":
	parser = argparse.ArgumentParser()
	parser.add_argument("--max-num-files", type=int)
	parser.add_argument(
	"--mode", type=str, choices=["simple", "conv_release"], default="simple"
	)
	parser.add_argument("--task_name", type=str, default="image_editing", choices=["image_editing", "t2i_generation", "video_generation"])
	parser.add_argument("--exclude-model-names", type=str, nargs="+")
	parser.add_argument("--ban-ip-file", type=str)
	parser.add_argument("--sanitize-ip", action="store_true", default=False)
	args = parser.parse_args()

	log_files = get_log_files(args.max_num_files)
	ban_ip_list = json.load(open(args.ban_ip_file)) if args.ban_ip_file else None

	battles = clean_battle_data(
	log_files, args.exclude_model_names or [], ban_ip_list, args.sanitize_ip, args.mode, args.task_name
	)
	last_updated_tstamp = battles[-1]["tstamp"]
	cutoff_date = datetime.datetime.fromtimestamp(
	last_updated_tstamp, tz=timezone("US/Pacific")
	).strftime("%Y%m%d")

	if args.mode == "simple":
	for x in battles:
	for key in [
	"conversation_a",
	"conversation_b",
	"question_id",
	]:
	if key in x:
	del x[key]
	print("Samples:")
	for i in range(min(4, len(battles))):
	print(battles[i])
	output = f"clean_battle_{args.task_name}_{cutoff_date}.json"
	elif args.mode == "conv_release":
	output = f"clean_battle_{args.task_name}_conv_{cutoff_date}.json"

	with open(output, "w") as fout:
	json.dump(battles, fout, indent=2, ensure_ascii=False)
	print(f"Write cleaned data to {output}")

	with open("cut_off_date.txt", "w") as fout:
	fout.write(cutoff_date)