# OLA-VLM: ola_vlm/eval/get_all_stats.py
import json
import argparse
import os
import numpy as np
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_folder", type=str, default="./playground/data/eval/results")
    parser.add_argument("--ckpt", type=str)
    args = parser.parse_args()

    scores = {}

    # Locate the lmms-eval output sub-directory inside the checkpoint's results
    # folder: the first entry whose name contains (but is not just a fragment of)
    # the checkpoint name. `dir` stays bound after the break and is reused below
    # to read results.json.
    dirs = os.listdir(f"{args.results_folder}/{args.ckpt}")
    for dir in dirs:
        if args.ckpt in dir and dir not in args.ckpt:
            break
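
    # Results layout assumed by this script (inferred from the paths opened
    # below; exact folder names may differ per setup):
    #   {results_folder}/{ckpt}/
    #       mmstar/merge_score.json
    #       cv-bench/merge_score.json
    #       mmvp/merge_score.json
    #       <lmms-eval output dir containing the ckpt name>/results.json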
    # MMStar
    try:
        with open(f"{args.results_folder}/{args.ckpt}/mmstar/merge_score.json", "r") as f:
            data = json.load(f)
        scores["MMStar"] = round(data.get("final score", 0)*100, 1) if data.get("final score") is not None else None
    except (OSError, json.JSONDecodeError):
        scores["MMStar"] = None
    cv_scores = {}
    with open(f"{args.results_folder}/{args.ckpt}/cv-bench/merge_score.json", "r") as f:
        data = json.load(f)
    scores["CV-Bench"] = round(data.get("Overall", 0)*100, 1) if data.get("Overall") is not None else None
    cv_scores["CV-Bench (2D)"] = round(data.get("2D", 0)*100, 1) if data.get("2D") is not None else None
    cv_scores["CV-Bench (3D)"] = round(data.get("3D", 0)*100, 1) if data.get("3D") is not None else None
    cv_scores["CV-Bench (Count)"] = round(data.get("Count", 0)*100, 1) if data.get("Count") is not None else None
    cv_scores["CV-Bench (Depth)"] = round(data.get("Depth", 0)*100, 1) if data.get("Depth") is not None else None
    cv_scores["CV-Bench (Relation)"] = round(data.get("Relation", 0)*100, 1) if data.get("Relation") is not None else None
    cv_scores["CV-Bench (Distance)"] = round(data.get("Distance", 0)*100, 1) if data.get("Distance") is not None else None
    with open(f"{args.results_folder}/{args.ckpt}/{dir}/results.json", "r") as f:
        results = json.load(f).get("results", {})
    # scores["MME-Cognition"] = round(results.get("mme", {}).get("mme_cognition_score,none", 0), 1) if results.get("mme", {}).get("mme_cognition_score,none") is not None else None
    # scores["MME-Perception"] = round(results.get("mme", {}).get("mme_percetion_score,none", 0), 1) if results.get("mme", {}).get("mme_percetion_score,none") is not None else None
    scores["Realworld-QA"] = round(results.get("realworldqa", {}).get("exact_match,flexible-extract", 0)*100, 1) if results.get("realworldqa", {}).get("exact_match,flexible-extract") is not None else None
    scores["VizWiz-VQA-Val"] = round(results.get("vizwiz_vqa_val", {}).get("exact_match,none", 0)*100, 1) if results.get("vizwiz_vqa_val", {}).get("exact_match,none") is not None else None
    # scores["SEEDBench-Image"] = round(results.get("seedbench", {}).get("seed_image,none", 0)*100, 1) if results.get("seedbench", {}).get("seed_image,none") is not None else None
    # scores["VQAv2-Val"] = round(results.get("vqav2_val", {}).get("exact_match,none", 0)*100, 1) if results.get("vqav2_val", {}).get("exact_match,none") is not None else None
    # scores["Science-QA-Img"] = round(results.get("scienceqa_img", {}).get("exact_match,none", 0)*100, 1) if results.get("scienceqa_img", {}).get("exact_match,none") is not None else None
    scores["MMMU-Val"] = round(results.get("mmmu_val", {}).get("mmmu_acc,none", 0)*100, 1) if results.get("mmmu_val", {}).get("mmmu_acc,none") is not None else None
    # scores["MMBench"] = round(results.get("mmbench_en_dev", {}).get("gpt_eval_score,none", 0), 1) if results.get("mmbench_en_dev", {}).get("gpt_eval_score,none") is not None else None
    # scores["NaturalBench"] = round(results.get("naturalbench", {}).get("mme_score,none", 0)*100, 1) if results.get("naturalbench", {}).get("mme_score,none") is not None else None
    # scores["GQA"] = round(results.get("gqa", {}).get("exact_match,none", 0)*100, 1) if results.get("gqa", {}).get("exact_match,none") is not None else None
    scores["POPE"] = round(results.get("pope", {}).get("pope_accuracy,none", 0)*100, 1) if results.get("pope", {}).get("pope_accuracy,none") is not None else None
    scores["MMVet"] = round(results.get("mmvet", {}).get("gpt_eval_score", 0)*100, 1) if results.get("mmvet", {}).get("gpt_eval_score") is not None else None
    scores["OK-VQA"] = round(results.get("ok_vqa", {}).get("exact_match,none", 0)*100, 1) if results.get("ok_vqa", {}).get("exact_match,none") is not None else None
    # scores["ChartQA"] = round(results.get("chartqa", {}).get("relaxed_overall,none", 0)*100, 1) if results.get("chartqa", {}).get("relaxed_overall,none") is not None else None
    # scores["DocVQA"] = round(results.get("docvqa_val", {}).get("anls,none", 0)*100, 1) if results.get("docvqa_val", {}).get("anls,none") is not None else None
    # scores["TextVQA"] = round(results.get("textvqa_val", {}).get("exact_match,none", 0)*100, 1) if results.get("textvqa_val", {}).get("exact_match,none") is not None else None
    # MMVP
    try:
        with open(f"{args.results_folder}/{args.ckpt}/mmvp/merge_score.json", "r") as f:
            data = json.load(f)
        scores["MMVP"] = round(data.get("mmvp", 0)*100, 1) if data.get("mmvp") is not None else None
    except (OSError, json.JSONDecodeError):
        scores["MMVP"] = None
    keys = list(scores.keys())
    str_scores = [str(scores[key]) if scores[key] is not None else 'None' for key in keys]

    abl_keys = ["CV-Bench", "MMStar", "VizWiz-VQA-Val", "MMVet", "MMVP", "MMMU-Val"]
    abl_scores = [scores[key] for key in abl_keys if scores[key] is not None]

    small_abl_keys = ["CV-Bench", "MMStar", "OK-VQA", "MMMU-Val"]
    small_abl_scores = [scores[key] for key in small_abl_keys if scores[key] is not None]

    cv_bench_keys = ["CV-Bench (2D)", "CV-Bench (3D)", "CV-Bench (Count)", "CV-Bench (Depth)", "CV-Bench (Relation)", "CV-Bench (Distance)"]
    cv_bench_scores = [cv_scores[key] for key in cv_bench_keys if cv_scores[key] is not None]

    # cat_scores = {}
    # if os.path.exists(f"{args.results_folder}/{args.ckpt}/categorized_scores.json"):
    #     with open(f"{args.results_folder}/{args.ckpt}/categorized_scores.json", "r") as f:
    #         cat_scores = json.load(f)
    #     cat_scores.pop("Both")
print("\n====================All-Scores===========================================")
print(" & ".join(keys))
print(" & ".join(str_scores))
if abl_scores:
print("\n====================Abl-Scores===========================================")
print(" & ".join(abl_keys))
print(" & ".join([str(a) for a in abl_scores]))
print(f"Ablation Avg: {round(np.mean(abl_scores), 1)}")
else:
print("Ablation Avg: None")
if small_abl_scores:
print("\n====================Small-Abl-Scores===========================================")
print(" & ".join(small_abl_keys))
print(" & ".join([str(a) for a in small_abl_scores]))
print(f"Small-Ablation Avg: {round(np.mean(small_abl_scores), 1)}")
else:
print("Small-Ablation Avg: None")
    if cv_bench_scores:
        print("\n====================CV-Bench-Scores===========================================")
        print(" & ".join(cv_bench_keys))
        print(" & ".join([str(c) for c in cv_bench_scores]))
        # Overall is the mean of the first two entries, i.e. the 2D and 3D splits
        # (assumes both are present in cv_bench_scores).
        print(f"CV-Bench Overall: {round(np.mean(cv_bench_scores[:2]), 1)}")
    else:
        print("CV-Bench Avg: None")
    # if cat_scores is not None:
    #     print("\n====================Categorized-Scores===========================================")
    #     cats = []
    #     class_scores = []
    #     benches = []
    #     for k, v in cat_scores.items():
    #         cats.append(k)
    #         for bench, score in v.items():
    #             benches.append(bench)
    #             class_scores.append(round(score*100, 1))
    #     print(" & ".join(cats))
    #     print(" & ".join(benches))
    #     print(" & ".join([str(c) for c in class_scores]))

    print("================================================================")