# OLA-VLM: ola_vlm/eval/get_all_stats.py
import json
import argparse
import os
import numpy as np
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_folder", type=str, default="./playground/data/eval/results")
    parser.add_argument("--ckpt", type=str)
    args = parser.parse_args()

    scores = {}

    # Locate the lmms-eval output sub-directory inside the checkpoint's results
    # folder: the first entry whose name contains (but is not just a fragment of)
    # the checkpoint name. `dir` stays bound after the break and is reused below
    # to read results.json.
    dirs = os.listdir(f"{args.results_folder}/{args.ckpt}")
    for dir in dirs:
        if args.ckpt in dir and dir not in args.ckpt:
            break
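
    # Results layout assumed by this script (inferred from the paths opened
    # below; exact folder names may differ per setup):
    #   {results_folder}/{ckpt}/
    #       mmstar/merge_score.json
    #       cv-bench/merge_score.json
    #       mmvp/merge_score.json
    #       <lmms-eval output dir containing the ckpt name>/results.json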
    # MMStar
    try:
        with open(f"{args.results_folder}/{args.ckpt}/mmstar/merge_score.json", "r") as f:
            data = json.load(f)
        scores["MMStar"] = round(data.get("final score", 0)*100, 1) if data.get("final score") is not None else None
    except (OSError, json.JSONDecodeError):
        scores["MMStar"] = None
    cv_scores = {}
    with open(f"{args.results_folder}/{args.ckpt}/cv-bench/merge_score.json", "r") as f:
        data = json.load(f)
    scores["CV-Bench"] = round(data.get("Overall", 0)*100, 1) if data.get("Overall") is not None else None
    cv_scores["CV-Bench (2D)"] = round(data.get("2D", 0)*100, 1) if data.get("2D") is not None else None
    cv_scores["CV-Bench (3D)"] = round(data.get("3D", 0)*100, 1) if data.get("3D") is not None else None
    cv_scores["CV-Bench (Count)"] = round(data.get("Count", 0)*100, 1) if data.get("Count") is not None else None
    cv_scores["CV-Bench (Depth)"] = round(data.get("Depth", 0)*100, 1) if data.get("Depth") is not None else None
    cv_scores["CV-Bench (Relation)"] = round(data.get("Relation", 0)*100, 1) if data.get("Relation") is not None else None
    cv_scores["CV-Bench (Distance)"] = round(data.get("Distance", 0)*100, 1) if data.get("Distance") is not None else None
    with open(f"{args.results_folder}/{args.ckpt}/{dir}/results.json", "r") as f:
        results = json.load(f).get("results", {})
    # scores["MME-Cognition"] = round(results.get("mme", {}).get("mme_cognition_score,none", 0), 1) if results.get("mme", {}).get("mme_cognition_score,none") is not None else None
    # scores["MME-Perception"] = round(results.get("mme", {}).get("mme_percetion_score,none", 0), 1) if results.get("mme", {}).get("mme_percetion_score,none") is not None else None
    scores["Realworld-QA"] = round(results.get("realworldqa", {}).get("exact_match,flexible-extract", 0)*100, 1) if results.get("realworldqa", {}).get("exact_match,flexible-extract") is not None else None
    scores["VizWiz-VQA-Val"] = round(results.get("vizwiz_vqa_val", {}).get("exact_match,none", 0)*100, 1) if results.get("vizwiz_vqa_val", {}).get("exact_match,none") is not None else None
    # scores["SEEDBench-Image"] = round(results.get("seedbench", {}).get("seed_image,none", 0)*100, 1) if results.get("seedbench", {}).get("seed_image,none") is not None else None
    # scores["VQAv2-Val"] = round(results.get("vqav2_val", {}).get("exact_match,none", 0)*100, 1) if results.get("vqav2_val", {}).get("exact_match,none") is not None else None
    # scores["Science-QA-Img"] = round(results.get("scienceqa_img", {}).get("exact_match,none", 0)*100, 1) if results.get("scienceqa_img", {}).get("exact_match,none") is not None else None
    scores["MMMU-Val"] = round(results.get("mmmu_val", {}).get("mmmu_acc,none", 0)*100, 1) if results.get("mmmu_val", {}).get("mmmu_acc,none") is not None else None
    # scores["MMBench"] = round(results.get("mmbench_en_dev", {}).get("gpt_eval_score,none", 0), 1) if results.get("mmbench_en_dev", {}).get("gpt_eval_score,none") is not None else None
    # scores["NaturalBench"] = round(results.get("naturalbench", {}).get("mme_score,none", 0)*100, 1) if results.get("naturalbench", {}).get("mme_score,none") is not None else None
    # scores["GQA"] = round(results.get("gqa", {}).get("exact_match,none", 0)*100, 1) if results.get("gqa", {}).get("exact_match,none") is not None else None
    scores["POPE"] = round(results.get("pope", {}).get("pope_accuracy,none", 0)*100, 1) if results.get("pope", {}).get("pope_accuracy,none") is not None else None
    scores["MMVet"] = round(results.get("mmvet", {}).get("gpt_eval_score", 0)*100, 1) if results.get("mmvet", {}).get("gpt_eval_score") is not None else None
    scores["OK-VQA"] = round(results.get("ok_vqa", {}).get("exact_match,none", 0)*100, 1) if results.get("ok_vqa", {}).get("exact_match,none") is not None else None
    # scores["ChartQA"] = round(results.get("chartqa", {}).get("relaxed_overall,none", 0)*100, 1) if results.get("chartqa", {}).get("relaxed_overall,none") is not None else None
    # scores["DocVQA"] = round(results.get("docvqa_val", {}).get("anls,none", 0)*100, 1) if results.get("docvqa_val", {}).get("anls,none") is not None else None
    # scores["TextVQA"] = round(results.get("textvqa_val", {}).get("exact_match,none", 0)*100, 1) if results.get("textvqa_val", {}).get("exact_match,none") is not None else None
    # MMVP
    try:
        with open(f"{args.results_folder}/{args.ckpt}/mmvp/merge_score.json", "r") as f:
            data = json.load(f)
        scores["MMVP"] = round(data.get("mmvp", 0)*100, 1) if data.get("mmvp") is not None else None
    except (OSError, json.JSONDecodeError):
        scores["MMVP"] = None
    keys = list(scores.keys())
    str_scores = [str(scores[key]) if scores[key] is not None else 'None' for key in keys]

    abl_keys = ["CV-Bench", "MMStar", "VizWiz-VQA-Val", "MMVet", "MMVP", "MMMU-Val"]
    abl_scores = [scores[key] for key in abl_keys if scores[key] is not None]

    small_abl_keys = ["CV-Bench", "MMStar", "OK-VQA", "MMMU-Val"]
    small_abl_scores = [scores[key] for key in small_abl_keys if scores[key] is not None]

    cv_bench_keys = ["CV-Bench (2D)", "CV-Bench (3D)", "CV-Bench (Count)", "CV-Bench (Depth)", "CV-Bench (Relation)", "CV-Bench (Distance)"]
    cv_bench_scores = [cv_scores[key] for key in cv_bench_keys if cv_scores[key] is not None]

    # cat_scores = {}
    # if os.path.exists(f"{args.results_folder}/{args.ckpt}/categorized_scores.json"):
    #     with open(f"{args.results_folder}/{args.ckpt}/categorized_scores.json", "r") as f:
    #         cat_scores = json.load(f)
    #     cat_scores.pop("Both")
print("\n====================All-Scores===========================================")
print(" & ".join(keys))
print(" & ".join(str_scores))
if abl_scores:
print("\n====================Abl-Scores===========================================")
print(" & ".join(abl_keys))
print(" & ".join([str(a) for a in abl_scores]))
print(f"Ablation Avg: {round(np.mean(abl_scores), 1)}")
else:
print("Ablation Avg: None")
if small_abl_scores:
print("\n====================Small-Abl-Scores===========================================")
print(" & ".join(small_abl_keys))
print(" & ".join([str(a) for a in small_abl_scores]))
print(f"Small-Ablation Avg: {round(np.mean(small_abl_scores), 1)}")
else:
print("Small-Ablation Avg: None")
    if cv_bench_scores:
        print("\n====================CV-Bench-Scores===========================================")
        print(" & ".join(cv_bench_keys))
        print(" & ".join([str(c) for c in cv_bench_scores]))
        # Overall is the mean of the first two entries, i.e. the 2D and 3D splits
        # (assumes both are present in cv_bench_scores).
        print(f"CV-Bench Overall: {round(np.mean(cv_bench_scores[:2]), 1)}")
    else:
        print("CV-Bench Avg: None")
    # if cat_scores is not None:
    #     print("\n====================Categorized-Scores===========================================")
    #     cats = []
    #     class_scores = []
    #     benches = []
    #     for k, v in cat_scores.items():
    #         cats.append(k)
    #         for bench, score in v.items():
    #             benches.append(bench)
    #             class_scores.append(round(score*100, 1))
    #     print(" & ".join(cats))
    #     print(" & ".join(benches))
    #     print(" & ".join([str(c) for c in class_scores]))

    print("================================================================")