# Page-header residue from the hosting site, kept as comments so the module parses:
# callanwu's picture / update / 404f089
import json
import pandas as pd
import gradio as gr
from content import *
from css import *
# Name of the trailing column that holds each row's rank; the tables are
# sorted on this column before being displayed.
NONE_COL = "Ranking"

# Column headers of the agent leaderboard, in display order.
AGENT_COLS = [
    "Method",
    "Model",
    "SS Easy",
    "SS Medium",
    "SS Hard",
    "MS Easy",
    "MS Medium",
    "MS Hard",
    "Overall",
    NONE_COL,
]

# One datatype per column (passed as `datatype=` to the Gradio Dataframe):
# the two leading name columns are strings, everything else is numeric.
# Derived from AGENT_COLS so the two lists cannot drift out of sync.
AGENT_TYPES = ["str", "str"] + ["number"] * (len(AGENT_COLS) - 2)
# Lookup tables mapping the raw identifiers found in the result files to the
# names shown in the leaderboard. Keys are matched against values read from
# the JSONL files, so their spelling must not be changed here.

# item["model"] -> displayed model name.
model_name_adic = {
    "qwen-plus": "Qwen-Plus",
    "qwen2.5-72b-instruct": "Qwen2.5-72B",
    "qwen2.5-7b-instruct": "Qwen2.5-7B",
    "qwen2.5-14b-instruct": "Qwen2.5-14B",
    "qwen2.5-32b-instruct": "Qwen2.5-32B",
    "gpt-4o": "GPT-4o",
}

# item["method"] -> displayed agent method name.
method_name_adic = {
    "reflexion": "Reflexion",
    "react": "React",
    "seeker": "WebWalker",
}

# item["system"] -> displayed RAG system name.
rag_name_adic = {
    "kimi": "Kimi",
    "mindsearch": "MindSearch",
    # The key keeps its original spelling because it is looked up with the
    # "system" value from rag_result.jsonl; only the displayed label is fixed.
    "navie": "Naive RAG",
    "o1": "o1",
    "tongyi": "Tongyi",
    "wenxin": "ERNIE",
    "gemini": "Gemini",
    "gemini_search": "Gemini w/ Search",
    "doubao": "Doubao",
}
# ---- Agent leaderboard table ------------------------------------------------
# Reads agents_result.jsonl (one JSON object per line) and builds `agent_df`,
# a DataFrame with the AGENT_COLS columns. Each record must provide the keys
# "method", "model" and every key listed in _AGENT_METRIC_KEYS.

# Score keys read from each record, in the order of the score columns.
_AGENT_METRIC_KEYS = (
    "ss_easy",
    "ss_medium",
    "ss_hard",
    "ms_easy",
    "ms_medium",
    "ms_hard",
    "overall",
)

# Parse the file once and keep (display method, display model, raw record).
agent_items = []
with open("agents_result.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank / trailing lines in the JSONL file
        item = json.loads(line)
        agent_items.append(
            (
                # Fall back to the raw identifier when no display name is
                # registered, instead of failing the whole page on a KeyError.
                method_name_adic.get(item["method"], item["method"]),
                model_name_adic.get(item["model"], item["model"]),
                item,
            )
        )

# [method, model, overall] entries in ascending order of the overall score.
agent_ranking = sorted(
    ([method, model, item["overall"]] for method, model, item in agent_items),
    key=lambda entry: entry[2],
)

# Position in that ascending order: 0 is the lowest overall score.
# Keyed by the (method, model) pair rather than a concatenated string so two
# different pairs can never collapse into the same key.
# NOTE(review): the best entry therefore shows the largest "Ranking" value —
# confirm this is the intended display.
ranking_dict = {
    (method, model): rank
    for rank, (method, model, _score) in enumerate(agent_ranking)
}

# Scores are scaled by 100 and rendered with two decimals.
agent_df = pd.DataFrame.from_records(
    [
        [method, model]
        + [f"{item[key] * 100:.2f}" for key in _AGENT_METRIC_KEYS]
        + [ranking_dict[(method, model)]]
        for method, model, item in agent_items
    ],
    columns=AGENT_COLS,
)

# Highest rank value (i.e. highest overall score) first.
agent_df = agent_df.sort_values(by=[NONE_COL], ascending=False)
# Column headers of the RAG-system leaderboard, in display order.
RAG_COLS = [
    "System",
    "SS Easy",
    "SS Medium",
    "SS Hard",
    "MS Easy",
    "MS Medium",
    "MS Hard",
    "Overall",
    NONE_COL,
]

# One datatype per column (passed as `datatype=` to the Gradio Dataframe):
# the leading name column is a string, everything else is numeric.
# Derived from RAG_COLS so the two lists cannot drift out of sync.
RAG_TYPES = ["str"] + ["number"] * (len(RAG_COLS) - 1)
# ---- RAG-system leaderboard table -------------------------------------------
# Reads rag_result.jsonl (one JSON object per line) and builds `rag_df`,
# a DataFrame with the RAG_COLS columns. Each record must provide the key
# "system" and every key listed in _RAG_METRIC_KEYS.

# Score keys read from each record, in the order of the score columns.
_RAG_METRIC_KEYS = (
    "ss_easy",
    "ss_medium",
    "ss_hard",
    "ms_easy",
    "ms_medium",
    "ms_hard",
    "overall",
)

# Parse the file once and keep (display system name, raw record).
rag_items = []
with open("rag_result.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank / trailing lines in the JSONL file
        item = json.loads(line)
        # Fall back to the raw identifier when no display name is registered,
        # instead of failing the whole page on a KeyError.
        rag_items.append((rag_name_adic.get(item["system"], item["system"]), item))

# [system, overall] entries in ascending order of the overall score.
rag_ranking = sorted(
    ([system, item["overall"]] for system, item in rag_items),
    key=lambda entry: entry[1],
)

# Position in that ascending order: 0 is the lowest overall score.
# NOTE(review): the best entry therefore shows the largest "Ranking" value —
# confirm this is the intended display.
ranking_dict = {system: rank for rank, (system, _score) in enumerate(rag_ranking)}

# Scores are scaled by 100 and rendered with two decimals.
rag_df = pd.DataFrame.from_records(
    [
        [system]
        + [f"{item[key] * 100:.2f}" for key in _RAG_METRIC_KEYS]
        + [ranking_dict[system]]
        for system, item in rag_items
    ],
    columns=RAG_COLS,
)

# Highest rank value (i.e. highest overall score) first.
rag_df = rag_df.sort_values(by=[NONE_COL], ascending=False)
# ---- Gradio UI ---------------------------------------------------------------
# Page layout: title and intro text, the two leaderboard tables in tabs, then
# the legend, credits and citation.
# NOTE(review): the original indentation was lost; the nesting below (both
# tabs inside the group, trailing Markdown inside `demo`, launch outside) is
# reconstructed — confirm against the deployed layout.
demo = gr.Blocks(css=CUSTOM_CSS)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.Markdown(HOW_TO, elem_classes="markdown-text")

    gr.Markdown("## Leaderboard")
    with gr.Group():
        # Tab labels: the emoji were mis-decoded (mojibake) and are restored.
        with gr.Tab("Results: Agent 🤖️"):
            leaderboard_table_test = gr.components.Dataframe(
                value=agent_df,
                datatype=AGENT_TYPES,
                interactive=False,
                column_widths=["20%"] * len(agent_df.columns),
            )
        with gr.Tab("Results: RAG-system 🔍"):
            leaderboard_table_val = gr.components.Dataframe(
                value=rag_df,
                datatype=RAG_TYPES,
                interactive=False,
                # One width per column, matching the agent table; previously
                # only a single width was supplied.
                column_widths=["20%"] * len(rag_df.columns),
            )

    gr.Markdown("SS denotes single-source, and MS denotes multi-source. Easy, Medium, and Hard denote the difficulty level of the question.")
    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

demo.launch(share=True)