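# Collects per-benchmark evaluation scores for a single checkpoint from the
# JSON files written by the eval pipeline and prints them as " & "-joined rows
# (LaTeX table style), together with a few averaged ablation summaries.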
import json
import argparse
import os

import numpy as np
from icecream import ic
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_folder", type=str, default="./playground/data/eval/results")
    parser.add_argument("--ckpt", type=str)
    args = parser.parse_args()

    scores = {}

    # Find the results subdirectory whose name contains the checkpoint name;
    # `dir` keeps that value after the break and is reused below to locate results.json.
    dirs = os.listdir(f"{args.results_folder}/{args.ckpt}")
    for dir in dirs:
        if args.ckpt in dir and dir not in args.ckpt:
            break
    # MMStar score from its merged score file; a missing or unreadable file yields None.
    try:
        with open(f"{args.results_folder}/{args.ckpt}/mmstar/merge_score.json", "r") as f:
            data = json.load(f)
            scores["MMStar"] = round(data.get("final score", 0)*100, 1) if data.get("final score") is not None else None
    except Exception:
        scores["MMStar"] = None
    # CV-Bench overall score plus its per-split breakdown.
    cv_scores = {}
    with open(f"{args.results_folder}/{args.ckpt}/cv-bench/merge_score.json", "r") as f:
        data = json.load(f)
        scores["CV-Bench"] = round(data.get("Overall", 0)*100, 1) if data.get("Overall") is not None else None
        cv_scores["CV-Bench (2D)"] = round(data.get("2D", 0)*100, 1) if data.get("2D") is not None else None
        cv_scores["CV-Bench (3D)"] = round(data.get("3D", 0)*100, 1) if data.get("3D") is not None else None
        cv_scores["CV-Bench (Count)"] = round(data.get("Count", 0)*100, 1) if data.get("Count") is not None else None
        cv_scores["CV-Bench (Depth)"] = round(data.get("Depth", 0)*100, 1) if data.get("Depth") is not None else None
        cv_scores["CV-Bench (Relation)"] = round(data.get("Relation", 0)*100, 1) if data.get("Relation") is not None else None
        cv_scores["CV-Bench (Distance)"] = round(data.get("Distance", 0)*100, 1) if data.get("Distance") is not None else None
with open(f"{args.results_folder}/{args.ckpt}/{dir}/results.json", "r") as f: | |
results = json.load(f).get("results", {}) | |
# scores["MME-Cognition"] = round(results.get("mme", {}).get("mme_cognition_score,none", 0), 1) if results.get("mme", {}).get("mme_cognition_score,none") is not None else None | |
# scores["MME-Perception"] = round(results.get("mme", {}).get("mme_percetion_score,none", 0), 1) if results.get("mme", {}).get("mme_percetion_score,none") is not None else None | |
scores["Realworld-QA"] = round(results.get("realworldqa", {}).get("exact_match,flexible-extract", 0)*100, 1) if results.get("realworldqa", {}).get("exact_match,flexible-extract") is not None else None | |
scores["VizWiz-VQA-Val"] = round(results.get("vizwiz_vqa_val", {}).get("exact_match,none", 0)*100, 1) if results.get("vizwiz_vqa_val", {}).get("exact_match,none") is not None else None | |
# scores["SEEDBench-Image"] = round(results.get("seedbench", {}).get("seed_image,none", 0)*100, 1) if results.get("seedbench", {}).get("seed_image,none") is not None else None | |
# scores["VQAv2-Val"] = round(results.get("vqav2_val", {}).get("exact_match,none", 0)*100, 1) if results.get("vqav2_val", {}).get("exact_match,none") is not None else None | |
# scores["Science-QA-Img"] = round(results.get("scienceqa_img", {}).get("exact_match,none", 0)*100, 1) if results.get("scienceqa_img", {}).get("exact_match,none") is not None else None | |
scores["MMMU-Val"] = round(results.get("mmmu_val", {}).get("mmmu_acc,none", 0)*100, 1) if results.get("mmmu_val", {}).get("mmmu_acc,none") is not None else None | |
# scores["MMBench"] = round(results.get("mmbench_en_dev", {}).get("gpt_eval_score,none", 0), 1) if results.get("mmbench_en_dev", {}).get("gpt_eval_score,none") is not None else None | |
# scores["NaturalBench"] = round(results.get("naturalbench", {}).get("mme_score,none", 0)*100, 1) if results.get("naturalbench", {}).get("mme_score,none") is not None else None | |
# scores["GQA"] = round(results.get("gqa", {}).get("exact_match,none", 0)*100, 1) if results.get("gqa", {}).get("exact_match,none") is not None else None | |
scores["POPE"] = round(results.get("pope", {}).get("pope_accuracy,none", 0)*100, 1) if results.get("pope", {}).get("pope_accuracy,none") is not None else None | |
scores["MMVet"] = round(results.get("mmvet", {}).get("gpt_eval_score", 0)*100, 1) if results.get("mmvet", {}).get("gpt_eval_score") is not None else None | |
scores["OK-VQA"] = round(results.get("ok_vqa", {}).get("exact_match,none", 0)*100, 1) if results.get("ok_vqa", {}).get("exact_match,none") is not None else None | |
# scores["ChartQA"] = round(results.get("chartqa", {}).get("relaxed_overall,none", 0)*100, 1) if results.get("chartqa", {}).get("relaxed_overall,none") is not None else None | |
# scores["DocVQA"] = round(results.get("docvqa_val", {}).get("anls,none", 0)*100, 1) if results.get("docvqa_val", {}).get("anls,none") is not None else None | |
# scores["TextVQA"] = round(results.get("textvqa_val", {}).get("exact_match,none", 0)*100, 1) if results.get("textvqa_val", {}).get("exact_match,none") is not None else None | |
    # MMVP score, again tolerating a missing or unreadable file.
    try:
        with open(f"{args.results_folder}/{args.ckpt}/mmvp/merge_score.json", "r") as f:
            data = json.load(f)
            scores["MMVP"] = round(data.get("mmvp", 0)*100, 1) if data.get("mmvp") is not None else None
    except Exception:
        scores["MMVP"] = None
    keys = list(scores.keys())
    str_scores = [str(scores[key]) if scores[key] is not None else 'None' for key in keys]

    # Benchmark subsets used for the averaged ablation summaries printed below.
    abl_keys = ["CV-Bench", "MMStar", "VizWiz-VQA-Val", "MMVet", "MMVP", "MMMU-Val"]
    abl_scores = [scores[key] for key in abl_keys if scores[key] is not None]
    small_abl_keys = ["CV-Bench", "MMStar", "OK-VQA", "MMMU-Val"]
    small_abl_scores = [scores[key] for key in small_abl_keys if scores[key] is not None]

    cv_bench_keys = ["CV-Bench (2D)", "CV-Bench (3D)", "CV-Bench (Count)", "CV-Bench (Depth)", "CV-Bench (Relation)", "CV-Bench (Distance)"]
    cv_bench_scores = [cv_scores[key] for key in cv_bench_keys if cv_scores[key] is not None]

    # cat_scores = {}
    # if os.path.exists(f"{args.results_folder}/{args.ckpt}/categorized_scores.json"):
    #     with open(f"{args.results_folder}/{args.ckpt}/categorized_scores.json", "r") as f:
    #         cat_scores = json.load(f)
    #     cat_scores.pop("Both")
print("\n====================All-Scores===========================================") | |
print(" & ".join(keys)) | |
print(" & ".join(str_scores)) | |
if abl_scores: | |
print("\n====================Abl-Scores===========================================") | |
print(" & ".join(abl_keys)) | |
print(" & ".join([str(a) for a in abl_scores])) | |
print(f"Ablation Avg: {round(np.mean(abl_scores), 1)}") | |
else: | |
print("Ablation Avg: None") | |
if small_abl_scores: | |
print("\n====================Small-Abl-Scores===========================================") | |
print(" & ".join(small_abl_keys)) | |
print(" & ".join([str(a) for a in small_abl_scores])) | |
print(f"Small-Ablation Avg: {round(np.mean(small_abl_scores), 1)}") | |
else: | |
print("Small-Ablation Avg: None") | |
if cv_bench_scores: | |
print("\n====================CV-Bench-Scores===========================================") | |
print(" & ".join(cv_bench_keys)) | |
print(" & ".join([str(c) for c in cv_bench_scores])) | |
print(f"CV-Bench Overall: {round(np.mean(cv_bench_scores[:2]), 1)}") | |
else: | |
print("CV-Bench Avg: None") | |
    # if cat_scores is not None:
    #     print("\n====================Categorized-Scores===========================================")
    #     cats = []
    #     class_scores = []
    #     benches = []
    #     for k, v in cat_scores.items():
    #         cats.append(k)
    #         for bench, score in v.items():
    #             benches.append(bench)
    #             class_scores.append(round(score*100, 1))
    #     print(" & ".join(cats))
    #     print(" & ".join(benches))
    #     print(" & ".join([str(c) for c in class_scores]))
    # print("================================================================")
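    # Example invocation (the script's filename is not shown here, so
    # "collect_scores.py" is only a placeholder):
    #   python collect_scores.py --results_folder ./playground/data/eval/results --ckpt <checkpoint-name>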