"""Gradio leaderboard app showing agent and RAG-system results.

Reads ``agents_result.jsonl`` and ``rag_result.jsonl`` (one JSON object per
line), turns each into a table sorted so that the entry with the highest
``overall`` score comes first, and serves both tables in a tabbed Gradio UI.
"""
import json

import pandas as pd
import gradio as gr

# Star imports supply the text/CSS constants used below (TITLE, INTRO_TEXT,
# HOW_TO, CREDIT, CITATION, CUSTOM_CSS).
from content import *
from css import *

NONE_COL = "Ranking"

# Headers of the seven score columns shared by both tables, in the same order
# as SCORE_KEYS.
SCORE_COLS = ["SS Easy", "SS Medium", "SS Hard", "MS Easy", "MS Medium", "MS Hard", "Overall"]
# Keys read from every JSONL record; each value is multiplied by 100 for display.
SCORE_KEYS = ("ss_easy", "ss_medium", "ss_hard", "ms_easy", "ms_medium", "ms_hard", "overall")

AGENT_COLS = ["Method", "Model", *SCORE_COLS, NONE_COL]
# One datatype per column: two label columns, then seven scores plus the rank.
AGENT_TYPES = ["str", "str"] + ["number"] * (len(SCORE_COLS) + 1)

RAG_COLS = ["System", *SCORE_COLS, NONE_COL]
# One datatype per column: one label column, then seven scores plus the rank.
RAG_TYPES = ["str"] + ["number"] * (len(SCORE_COLS) + 1)

# Raw identifiers found in the result files -> display names.
model_name_adic = {
    "qwen-plus": "Qwen-Plus",
    "qwen2.5-72b-instruct": "Qwen2.5-72B",
    "qwen2.5-7b-instruct": "Qwen2.5-7B",
    "qwen2.5-14b-instruct": "Qwen2.5-14B",
    "qwen2.5-32b-instruct": "Qwen2.5-32B",
    "gpt-4o": "GPT-4o",
}
method_name_adic = {
    "reflexion": "Reflexion",
    "react": "React",
    "seeker": "WebWalker",
}
rag_name_adic = {
    "kimi": "Kimi",
    "mindsearch": "MindSearch",
    # The key keeps the spelling used in the data file; only the label is fixed.
    "navie": "Naive RAG",
    "o1": "o1",
    "tongyi": "Tongyi",
    "wenxin": "ERNIE",
    "gemini": "Gemini",
    "gemini_search": "Gemini w/ Search",
    "doubao": "Doubao",
}


def _load_jsonl(path: str) -> list:
    """Return the JSON objects stored one per line in *path*.

    Blank lines are skipped so a stray empty line does not abort loading.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _agent_labels(item: dict) -> tuple:
    """Return the (method, model) display names for an agent record.

    Raises:
        KeyError: if the record's method or model has no display name.
    """
    return (method_name_adic[item["method"]], model_name_adic[item["model"]])


def _rag_labels(item: dict) -> tuple:
    """Return the one-element (system,) display name for a RAG record.

    Raises:
        KeyError: if the record's system has no display name.
    """
    return (rag_name_adic[item["system"]],)


def _build_leaderboard(records: list, label_fn, columns: list) -> pd.DataFrame:
    """Build a leaderboard table from parsed result records.

    Args:
        records: parsed JSONL records; each must contain every key in
            SCORE_KEYS plus whatever *label_fn* reads.
        label_fn: maps a record to a tuple of display labels that fill the
            leading column(s) and identify the entry.
        columns: column headers, ending with NONE_COL.

    Returns:
        A DataFrame with *columns*, scores formatted as percentages with two
        decimals, sorted by descending rank (highest ``overall`` first).
    """
    labelled = [(label_fn(item), item) for item in records]

    # Rank = position in ascending order of ``overall``: the lowest score gets
    # 0 and the best entry gets the largest value.
    # NOTE(review): the displayed "Ranking" is therefore 0-based with the best
    # entry holding the highest number — confirm this is intended rather than
    # 1 = best.
    ascending = sorted(labelled, key=lambda pair: pair[1]["overall"])
    rank_of = {}
    for position, (labels, _) in enumerate(ascending):
        # Entries are keyed by their concatenated labels; on a duplicate key
        # the later position wins.
        rank_of["".join(labels)] = position

    rows = [
        [
            *labels,
            *(f"{item[key] * 100:.2f}" for key in SCORE_KEYS),
            rank_of["".join(labels)],
        ]
        for labels, item in labelled
    ]
    frame = pd.DataFrame.from_records(rows, columns=columns)
    return frame.sort_values(by=[NONE_COL], ascending=False)[columns]


# Each result file is parsed once and reused for both ranking and row building.
agent_df = _build_leaderboard(_load_jsonl("agents_result.jsonl"), _agent_labels, AGENT_COLS)
rag_df = _build_leaderboard(_load_jsonl("rag_result.jsonl"), _rag_labels, RAG_COLS)

demo = gr.Blocks(css=CUSTOM_CSS)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.Markdown(HOW_TO, elem_classes="markdown-text")
    gr.Markdown("## Leaderboard")
    with gr.Group():
        with gr.Tab("Results: Agent 🤖️"):
            leaderboard_table_test = gr.components.Dataframe(
                value=agent_df,
                datatype=AGENT_TYPES,
                interactive=False,
                column_widths=["20%"] * len(agent_df.columns),
            )
        with gr.Tab("Results: RAG-system 🔍"):
            leaderboard_table_val = gr.components.Dataframe(
                value=rag_df,
                datatype=RAG_TYPES,
                interactive=False,
                # One width per column, consistent with the agent table.
                column_widths=["20%"] * len(rag_df.columns),
            )
    gr.Markdown("SS denotes single-source, and MS denotes multi-source. Easy, Medium, and Hard denote the difficulty level of the question.")
    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

demo.launch(share=True)