"""Collect benchmark scores for one checkpoint and print them as "&"-separated
rows (convenient for LaTeX tables), along with a few ablation averages."""

import argparse
import json
import os

import numpy as np


def pct(mapping, key, scale=100):
    """Return mapping[key] scaled and rounded to one decimal, or None if absent."""
    value = mapping.get(key)
    return round(value * scale, 1) if value is not None else None


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_folder", type=str, default="./playground/data/eval/results")
    parser.add_argument("--ckpt", type=str)
    args = parser.parse_args()

    ckpt_folder = f"{args.results_folder}/{args.ckpt}"
    scores = {}

    # Locate the lm-eval-harness output directory: the first entry whose name
    # strictly contains the checkpoint name (i.e. a run folder derived from it).
    results_dir = None
    for entry in os.listdir(ckpt_folder):
        if args.ckpt in entry and entry not in args.ckpt:
            results_dir = entry
            break

    # MMStar: produced by a separate merge script; tolerate a missing file.
    try:
        with open(f"{ckpt_folder}/mmstar/merge_score.json", "r") as f:
            data = json.load(f)
        scores["MMStar"] = pct(data, "final score")
    except (OSError, json.JSONDecodeError):
        scores["MMStar"] = None

    # CV-Bench: overall score plus a per-category breakdown.
    cv_scores = {}
    with open(f"{ckpt_folder}/cv-bench/merge_score.json", "r") as f:
        data = json.load(f)
    scores["CV-Bench"] = pct(data, "Overall")
    for cat in ("2D", "3D", "Count", "Depth", "Relation", "Distance"):
        cv_scores[f"CV-Bench ({cat})"] = pct(data, cat)

    # Remaining benchmarks come from the lm-eval-harness results file.
    with open(f"{ckpt_folder}/{results_dir}/results.json", "r") as f:
        results = json.load(f).get("results", {})

    # scores["MME-Cognition"] = pct(results.get("mme", {}), "mme_cognition_score,none", scale=1)
    # scores["MME-Perception"] = pct(results.get("mme", {}), "mme_percetion_score,none", scale=1)
    scores["Realworld-QA"] = pct(results.get("realworldqa", {}), "exact_match,flexible-extract")
    scores["VizWiz-VQA-Val"] = pct(results.get("vizwiz_vqa_val", {}), "exact_match,none")
    # scores["SEEDBench-Image"] = pct(results.get("seedbench", {}), "seed_image,none")
    # scores["VQAv2-Val"] = pct(results.get("vqav2_val", {}), "exact_match,none")
    # scores["Science-QA-Img"] = pct(results.get("scienceqa_img", {}), "exact_match,none")
    scores["MMMU-Val"] = pct(results.get("mmmu_val", {}), "mmmu_acc,none")
    # scores["MMBench"] = pct(results.get("mmbench_en_dev", {}), "gpt_eval_score,none", scale=1)
    # scores["NaturalBench"] = pct(results.get("naturalbench", {}), "mme_score,none")
    # scores["GQA"] = pct(results.get("gqa", {}), "exact_match,none")
    scores["POPE"] = pct(results.get("pope", {}), "pope_accuracy,none")
    scores["MMVet"] = pct(results.get("mmvet", {}), "gpt_eval_score")
    scores["OK-VQA"] = pct(results.get("ok_vqa", {}), "exact_match,none")
    # scores["ChartQA"] = pct(results.get("chartqa", {}), "relaxed_overall,none")
    # scores["DocVQA"] = pct(results.get("docvqa_val", {}), "anls,none")
    # scores["TextVQA"] = pct(results.get("textvqa_val", {}), "exact_match,none")

    # MMVP: produced by a separate merge script; tolerate a missing file.
    try:
        with open(f"{ckpt_folder}/mmvp/merge_score.json", "r") as f:
            data = json.load(f)
        scores["MMVP"] = pct(data, "mmvp")
    except (OSError, json.JSONDecodeError):
        scores["MMVP"] = None

    keys = list(scores.keys())
    str_scores = [str(scores[key]) for key in keys]  # str(None) renders as "None"

    # Keep keys and values aligned by dropping benchmarks that produced no score.
    abl_keys = ["CV-Bench", "MMStar", "VizWiz-VQA-Val", "MMVet", "MMVP", "MMMU-Val"]
    abl_keys = [key for key in abl_keys if scores[key] is not None]
    abl_scores = [scores[key] for key in abl_keys]

    small_abl_keys = ["CV-Bench", "MMStar", "OK-VQA", "MMMU-Val"]
    small_abl_keys = [key for key in small_abl_keys if scores[key] is not None]
    small_abl_scores = [scores[key] for key in small_abl_keys]

    cv_bench_keys = ["CV-Bench (2D)", "CV-Bench (3D)", "CV-Bench (Count)",
                     "CV-Bench (Depth)", "CV-Bench (Relation)", "CV-Bench (Distance)"]
    cv_bench_keys = [key for key in cv_bench_keys if cv_scores[key] is not None]
    cv_bench_scores = [cv_scores[key] for key in cv_bench_keys]

    # cat_scores = {}
    # if os.path.exists(f"{ckpt_folder}/categorized_scores.json"):
    #     with open(f"{ckpt_folder}/categorized_scores.json", "r") as f:
    #         cat_scores = json.load(f)
    #     cat_scores.pop("Both")

    print("\n====================All-Scores===========================================")
    print(" & ".join(keys))
    print(" & ".join(str_scores))

    if abl_scores:
        print("\n====================Abl-Scores===========================================")
        print(" & ".join(abl_keys))
        print(" & ".join(str(a) for a in abl_scores))
        print(f"Ablation Avg: {round(np.mean(abl_scores), 1)}")
    else:
        print("Ablation Avg: None")

    if small_abl_scores:
        print("\n====================Small-Abl-Scores===========================================")
        print(" & ".join(small_abl_keys))
        print(" & ".join(str(a) for a in small_abl_scores))
        print(f"Small-Ablation Avg: {round(np.mean(small_abl_scores), 1)}")
    else:
        print("Small-Ablation Avg: None")

    if cv_bench_scores:
        print("\n====================CV-Bench-Scores===========================================")
        print(" & ".join(cv_bench_keys))
        print(" & ".join(str(c) for c in cv_bench_scores))
        # The overall number averages only the first two entries, i.e. it
        # assumes the 2D and 3D aggregates are both present.
        print(f"CV-Bench Overall: {round(np.mean(cv_bench_scores[:2]), 1)}")
    else:
        print("CV-Bench Avg: None")

    # if cat_scores is not None:
    #     print("\n====================Categorized-Scores===========================================")
    #     cats = []
    #     class_scores = []
    #     benches = []
    #     for k, v in cat_scores.items():
    #         cats.append(k)
    #         for bench, score in v.items():
    #             benches.append(bench)
    #             class_scores.append(round(score * 100, 1))
    #     print(" & ".join(cats))
    #     print(" & ".join(benches))
    #     print(" & ".join([str(c) for c in class_scores]))
    # print("================================================================")