Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
feat: adapt to the latest data format
Browse files- app.py +4 -4
- src/benchmarks.py +2 -2
- src/display/utils.py +8 -0
- src/leaderboard/read_evals.py +19 -7
- tests/src/leaderboard/test_read_evals.py +17 -17
- utils.py +3 -3
app.py
CHANGED
|
@@ -27,12 +27,12 @@ try:
|
|
| 27 |
except Exception:
|
| 28 |
restart_space()
|
| 29 |
|
| 30 |
-
raw_data = get_raw_eval_results(EVAL_RESULTS_PATH)
|
| 31 |
|
| 32 |
original_df_qa = get_leaderboard_df(
|
| 33 |
raw_data, task='qa', metric='ndcg_at_3')
|
| 34 |
original_df_long_doc = get_leaderboard_df(
|
| 35 |
-
raw_data, task='
|
| 36 |
print(f'raw data: {len(raw_data)}')
|
| 37 |
print(f'QA data loaded: {original_df_qa.shape}')
|
| 38 |
print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
|
|
@@ -42,7 +42,7 @@ shown_columns_qa = get_default_cols('qa', leaderboard_df_qa.columns, add_fix_col
|
|
| 42 |
leaderboard_df_qa = leaderboard_df_qa[shown_columns_qa]
|
| 43 |
|
| 44 |
leaderboard_df_long_doc = original_df_long_doc.copy()
|
| 45 |
-
shown_columns_long_doc = get_default_cols('
|
| 46 |
leaderboard_df_long_doc = leaderboard_df_long_doc[shown_columns_long_doc]
|
| 47 |
|
| 48 |
|
|
@@ -62,7 +62,7 @@ def update_metric_long_doc(
|
|
| 62 |
reranking_model: list,
|
| 63 |
query: str,
|
| 64 |
):
|
| 65 |
-
return update_metric(raw_data,
|
| 66 |
|
| 67 |
|
| 68 |
demo = gr.Blocks(css=custom_css)
|
|
|
|
| 27 |
except Exception:
|
| 28 |
restart_space()
|
| 29 |
|
| 30 |
+
raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
|
| 31 |
|
| 32 |
original_df_qa = get_leaderboard_df(
|
| 33 |
raw_data, task='qa', metric='ndcg_at_3')
|
| 34 |
original_df_long_doc = get_leaderboard_df(
|
| 35 |
+
raw_data, task='long-doc', metric='ndcg_at_3')
|
| 36 |
print(f'raw data: {len(raw_data)}')
|
| 37 |
print(f'QA data loaded: {original_df_qa.shape}')
|
| 38 |
print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
|
|
|
|
| 42 |
leaderboard_df_qa = leaderboard_df_qa[shown_columns_qa]
|
| 43 |
|
| 44 |
leaderboard_df_long_doc = original_df_long_doc.copy()
|
| 45 |
+
shown_columns_long_doc = get_default_cols('long-doc', leaderboard_df_long_doc.columns, add_fix_cols=True)
|
| 46 |
leaderboard_df_long_doc = leaderboard_df_long_doc[shown_columns_long_doc]
|
| 47 |
|
| 48 |
|
|
|
|
| 62 |
reranking_model: list,
|
| 63 |
query: str,
|
| 64 |
):
|
| 65 |
+
return update_metric(raw_data, "long-doc", metric, domains, langs, reranking_model, query)
|
| 66 |
|
| 67 |
|
| 68 |
demo = gr.Blocks(css=custom_css)
|
src/benchmarks.py
CHANGED
|
@@ -40,7 +40,7 @@ dataset_dict = {
|
|
| 40 |
"arxiv": {
|
| 41 |
"en": ["Arxiv", ]},
|
| 42 |
},
|
| 43 |
-
"
|
| 44 |
"arxiv": {
|
| 45 |
"en": ["gpt-3", "llama2", "llm-survey", "gemini"],
|
| 46 |
},
|
|
@@ -125,7 +125,7 @@ for task, domain_dict in dataset_dict.items():
|
|
| 125 |
col_name = benchmark_name
|
| 126 |
for metric in dataset_list:
|
| 127 |
qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
|
| 128 |
-
elif task == "
|
| 129 |
for dataset in dataset_list:
|
| 130 |
benchmark_name = f"{domain}_{lang}_{dataset}"
|
| 131 |
benchmark_name = get_safe_name(benchmark_name)
|
|
|
|
| 40 |
"arxiv": {
|
| 41 |
"en": ["Arxiv", ]},
|
| 42 |
},
|
| 43 |
+
"long-doc": {
|
| 44 |
"arxiv": {
|
| 45 |
"en": ["gpt-3", "llama2", "llm-survey", "gemini"],
|
| 46 |
},
|
|
|
|
| 125 |
col_name = benchmark_name
|
| 126 |
for metric in dataset_list:
|
| 127 |
qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
|
| 128 |
+
elif task == "long-doc":
|
| 129 |
for dataset in dataset_list:
|
| 130 |
benchmark_name = f"{domain}_{lang}_{dataset}"
|
| 131 |
benchmark_name = get_safe_name(benchmark_name)
|
src/display/utils.py
CHANGED
|
@@ -22,6 +22,8 @@ class ColumnContent:
|
|
| 22 |
COL_NAME_AVG = "Average ⬆️"
|
| 23 |
COL_NAME_RETRIEVAL_MODEL = "Retrieval Model"
|
| 24 |
COL_NAME_RERANKING_MODEL = "Reranking Model"
|
|
|
|
|
|
|
| 25 |
COL_NAME_RANK = "Rank 🏆"
|
| 26 |
|
| 27 |
def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
|
|
@@ -34,6 +36,12 @@ def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
|
|
| 34 |
auto_eval_column_dict.append(
|
| 35 |
["reranking_model", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, never_hidden=True)]
|
| 36 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
auto_eval_column_dict.append(
|
| 38 |
["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)]
|
| 39 |
)
|
|
|
|
| 22 |
COL_NAME_AVG = "Average ⬆️"
|
| 23 |
COL_NAME_RETRIEVAL_MODEL = "Retrieval Model"
|
| 24 |
COL_NAME_RERANKING_MODEL = "Reranking Model"
|
| 25 |
+
COL_NAME_RETRIEVAL_MODEL_LINK = "Retrieval Model LINK"
|
| 26 |
+
COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"
|
| 27 |
COL_NAME_RANK = "Rank 🏆"
|
| 28 |
|
| 29 |
def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
|
|
|
|
| 36 |
auto_eval_column_dict.append(
|
| 37 |
["reranking_model", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, never_hidden=True)]
|
| 38 |
)
|
| 39 |
+
auto_eval_column_dict.append(
|
| 40 |
+
["retrieval_model_link", ColumnContent, ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", False, hidden=True, never_hidden=False)]
|
| 41 |
+
)
|
| 42 |
+
auto_eval_column_dict.append(
|
| 43 |
+
["reranking_model_link", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", False, hidden=True, never_hidden=False)]
|
| 44 |
+
)
|
| 45 |
auto_eval_column_dict.append(
|
| 46 |
["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)]
|
| 47 |
)
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -12,6 +12,8 @@ from src.display.formatting import has_no_nan_values
|
|
| 12 |
from src.display.utils import (
|
| 13 |
COL_NAME_RERANKING_MODEL,
|
| 14 |
COL_NAME_RETRIEVAL_MODEL,
|
|
|
|
|
|
|
| 15 |
COLS_QA,
|
| 16 |
QA_BENCHMARK_COLS,
|
| 17 |
COLS_LONG_DOC,
|
|
@@ -44,6 +46,8 @@ class FullEvalResult:
|
|
| 44 |
eval_name: str # name of the evaluation, [retrieval_model]_[reranking_model]
|
| 45 |
retrieval_model: str
|
| 46 |
reranking_model: str
|
|
|
|
|
|
|
| 47 |
results: List[EvalResult] # results on all the EvalResults over different tasks and metrics.
|
| 48 |
date: str = ""
|
| 49 |
|
|
@@ -58,10 +62,15 @@ class FullEvalResult:
|
|
| 58 |
|
| 59 |
# store all the results for different metrics and tasks
|
| 60 |
result_list = []
|
|
|
|
|
|
|
| 61 |
for item in model_data:
|
| 62 |
config = item.get("config", {})
|
| 63 |
# eval results for different metrics
|
| 64 |
results = item.get("results", [])
|
|
|
|
|
|
|
|
|
|
| 65 |
eval_result = EvalResult(
|
| 66 |
eval_name=f"{config['retrieval_model']}_{config['reranking_model']}_{config['metric']}",
|
| 67 |
retrieval_model=config["retrieval_model"],
|
|
@@ -75,6 +84,8 @@ class FullEvalResult:
|
|
| 75 |
eval_name=f"{result_list[0].retrieval_model}_{result_list[0].reranking_model}",
|
| 76 |
retrieval_model=result_list[0].retrieval_model,
|
| 77 |
reranking_model=result_list[0].reranking_model,
|
|
|
|
|
|
|
| 78 |
results=result_list
|
| 79 |
)
|
| 80 |
|
|
@@ -91,6 +102,8 @@ class FullEvalResult:
|
|
| 91 |
results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
|
| 92 |
results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = self.retrieval_model
|
| 93 |
results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = self.reranking_model
|
|
|
|
|
|
|
| 94 |
|
| 95 |
# print(f'result loaded: {eval_result.eval_name}')
|
| 96 |
for result in eval_result.results:
|
|
@@ -99,9 +112,9 @@ class FullEvalResult:
|
|
| 99 |
lang = result["lang"]
|
| 100 |
dataset = result["dataset"]
|
| 101 |
value = result["value"]
|
| 102 |
-
if
|
| 103 |
benchmark_name = f"{domain}_{lang}"
|
| 104 |
-
|
| 105 |
benchmark_name = f"{domain}_{lang}_{dataset}"
|
| 106 |
results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
|
| 107 |
return [v for v in results.values()]
|
|
@@ -115,13 +128,12 @@ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
|
|
| 115 |
for root, dirs, files in os.walk(results_path):
|
| 116 |
if len(files) == 0:
|
| 117 |
continue
|
| 118 |
-
try:
|
| 119 |
-
files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7], reverse=True)
|
| 120 |
-
except dateutil.parser._parser.ParserError:
|
| 121 |
-
files = [files[-1]]
|
| 122 |
|
| 123 |
# select the latest results
|
| 124 |
for file in files:
|
|
|
|
|
|
|
|
|
|
| 125 |
model_result_filepaths.append(os.path.join(root, file))
|
| 126 |
|
| 127 |
eval_results = {}
|
|
@@ -154,7 +166,7 @@ def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -
|
|
| 154 |
if task == "qa":
|
| 155 |
cols = COLS_QA
|
| 156 |
benchmark_cols = QA_BENCHMARK_COLS
|
| 157 |
-
elif task == "
|
| 158 |
cols = COLS_LONG_DOC
|
| 159 |
benchmark_cols = LONG_DOC_BENCHMARK_COLS
|
| 160 |
else:
|
|
|
|
| 12 |
from src.display.utils import (
|
| 13 |
COL_NAME_RERANKING_MODEL,
|
| 14 |
COL_NAME_RETRIEVAL_MODEL,
|
| 15 |
+
COL_NAME_RERANKING_MODEL_LINK,
|
| 16 |
+
COL_NAME_RETRIEVAL_MODEL_LINK,
|
| 17 |
COLS_QA,
|
| 18 |
QA_BENCHMARK_COLS,
|
| 19 |
COLS_LONG_DOC,
|
|
|
|
| 46 |
eval_name: str # name of the evaluation, [retrieval_model]_[reranking_model]
|
| 47 |
retrieval_model: str
|
| 48 |
reranking_model: str
|
| 49 |
+
retrieval_model_link: str
|
| 50 |
+
reranking_model_link: str
|
| 51 |
results: List[EvalResult] # results on all the EvalResults over different tasks and metrics.
|
| 52 |
date: str = ""
|
| 53 |
|
|
|
|
| 62 |
|
| 63 |
# store all the results for different metrics and tasks
|
| 64 |
result_list = []
|
| 65 |
+
retrieval_model_link = ""
|
| 66 |
+
reranking_model_link = ""
|
| 67 |
for item in model_data:
|
| 68 |
config = item.get("config", {})
|
| 69 |
# eval results for different metrics
|
| 70 |
results = item.get("results", [])
|
| 71 |
+
retrieval_model_link=config["retreival_model_link"]
|
| 72 |
+
if config["reranking_model_link"] is not None:
|
| 73 |
+
reranking_model_link=""
|
| 74 |
eval_result = EvalResult(
|
| 75 |
eval_name=f"{config['retrieval_model']}_{config['reranking_model']}_{config['metric']}",
|
| 76 |
retrieval_model=config["retrieval_model"],
|
|
|
|
| 84 |
eval_name=f"{result_list[0].retrieval_model}_{result_list[0].reranking_model}",
|
| 85 |
retrieval_model=result_list[0].retrieval_model,
|
| 86 |
reranking_model=result_list[0].reranking_model,
|
| 87 |
+
retrieval_model_link=retrieval_model_link,
|
| 88 |
+
reranking_model_link=reranking_model_link,
|
| 89 |
results=result_list
|
| 90 |
)
|
| 91 |
|
|
|
|
| 102 |
results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
|
| 103 |
results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = self.retrieval_model
|
| 104 |
results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = self.reranking_model
|
| 105 |
+
results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL_LINK] = self.retrieval_model_link
|
| 106 |
+
results[eval_result.eval_name][COL_NAME_RERANKING_MODEL_LINK] = self.reranking_model_link
|
| 107 |
|
| 108 |
# print(f'result loaded: {eval_result.eval_name}')
|
| 109 |
for result in eval_result.results:
|
|
|
|
| 112 |
lang = result["lang"]
|
| 113 |
dataset = result["dataset"]
|
| 114 |
value = result["value"]
|
| 115 |
+
if dataset == 'default':
|
| 116 |
benchmark_name = f"{domain}_{lang}"
|
| 117 |
+
else:
|
| 118 |
benchmark_name = f"{domain}_{lang}_{dataset}"
|
| 119 |
results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
|
| 120 |
return [v for v in results.values()]
|
|
|
|
| 128 |
for root, dirs, files in os.walk(results_path):
|
| 129 |
if len(files) == 0:
|
| 130 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
# select the latest results
|
| 133 |
for file in files:
|
| 134 |
+
if file != "results.json":
|
| 135 |
+
print(f'skip {file}')
|
| 136 |
+
continue
|
| 137 |
model_result_filepaths.append(os.path.join(root, file))
|
| 138 |
|
| 139 |
eval_results = {}
|
|
|
|
| 166 |
if task == "qa":
|
| 167 |
cols = COLS_QA
|
| 168 |
benchmark_cols = QA_BENCHMARK_COLS
|
| 169 |
+
elif task == "long-doc":
|
| 170 |
cols = COLS_LONG_DOC
|
| 171 |
benchmark_cols = LONG_DOC_BENCHMARK_COLS
|
| 172 |
else:
|
tests/src/leaderboard/test_read_evals.py
CHANGED
|
@@ -28,35 +28,35 @@ def test_to_dict():
|
|
| 28 |
|
| 29 |
|
| 30 |
def test_get_raw_eval_results():
|
| 31 |
-
results_path = cur_fp.parents[2] / "toydata" / "
|
| 32 |
results = get_raw_eval_results(results_path)
|
| 33 |
# only load the latest results
|
| 34 |
-
assert len(results) ==
|
| 35 |
-
assert results[0].eval_name == "bge-
|
| 36 |
-
assert len(results[0].results) ==
|
| 37 |
-
assert results[
|
| 38 |
-
assert len(results[1].results) ==
|
| 39 |
|
| 40 |
|
| 41 |
def test_get_leaderboard_df():
|
| 42 |
-
results_path = cur_fp.parents[2] / "toydata" / "
|
| 43 |
raw_data = get_raw_eval_results(results_path)
|
| 44 |
-
df = get_leaderboard_df(raw_data, 'qa', '
|
| 45 |
-
assert df.shape[0] ==
|
| 46 |
# the results contain only one embedding model
|
| 47 |
-
for i in range(
|
| 48 |
-
|
| 49 |
-
# the results contain only two reranking model
|
| 50 |
-
assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
|
| 51 |
-
assert df["Reranking Model"][1] == "NoReranker"
|
| 52 |
-
assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
|
| 53 |
-
assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()
|
| 54 |
|
| 55 |
|
| 56 |
def test_get_leaderboard_df_long_doc():
|
| 57 |
results_path = cur_fp.parents[2] / "toydata" / "test_results"
|
| 58 |
raw_data = get_raw_eval_results(results_path)
|
| 59 |
-
df = get_leaderboard_df(raw_data, '
|
| 60 |
assert df.shape[0] == 2
|
| 61 |
# the results contain only one embedding model
|
| 62 |
for i in range(2):
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def test_get_raw_eval_results():
|
| 31 |
+
results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
|
| 32 |
results = get_raw_eval_results(results_path)
|
| 33 |
# only load the latest results
|
| 34 |
+
assert len(results) == 4
|
| 35 |
+
assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
|
| 36 |
+
assert len(results[0].results) == 70
|
| 37 |
+
assert results[0].eval_name == "bge-base-en-v1.5_bge-reranker-v2-m3"
|
| 38 |
+
assert len(results[1].results) == 70
|
| 39 |
|
| 40 |
|
| 41 |
def test_get_leaderboard_df():
|
| 42 |
+
results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
|
| 43 |
raw_data = get_raw_eval_results(results_path)
|
| 44 |
+
df = get_leaderboard_df(raw_data, 'qa', 'ndcg_at_3')
|
| 45 |
+
assert df.shape[0] == 4
|
| 46 |
# the results contain only one embedding model
|
| 47 |
+
# for i in range(4):
|
| 48 |
+
# assert df["Retrieval Model"][i] == "bge-m3"
|
| 49 |
+
# # the results contain only two reranking model
|
| 50 |
+
# assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
|
| 51 |
+
# assert df["Reranking Model"][1] == "NoReranker"
|
| 52 |
+
# assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
|
| 53 |
+
# assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()
|
| 54 |
|
| 55 |
|
| 56 |
def test_get_leaderboard_df_long_doc():
|
| 57 |
results_path = cur_fp.parents[2] / "toydata" / "test_results"
|
| 58 |
raw_data = get_raw_eval_results(results_path)
|
| 59 |
+
df = get_leaderboard_df(raw_data, 'long-doc', 'ndcg_at_1')
|
| 60 |
assert df.shape[0] == 2
|
| 61 |
# the results contain only one embedding model
|
| 62 |
for i in range(2):
|
utils.py
CHANGED
|
@@ -47,7 +47,7 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
|
| 47 |
def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
|
| 48 |
if task == "qa":
|
| 49 |
cols = list(frozenset(COLS_QA).intersection(frozenset(BENCHMARK_COLS_QA)).intersection(frozenset(columns)))
|
| 50 |
-
elif task == "
|
| 51 |
cols = list(frozenset(COLS_LONG_DOC).intersection(frozenset(BENCHMARK_COLS_LONG_DOC)).intersection(frozenset(columns)))
|
| 52 |
else:
|
| 53 |
raise NotImplemented
|
|
@@ -68,7 +68,7 @@ def select_columns(df: pd.DataFrame, domain_query: list, language_query: list, t
|
|
| 68 |
for c in cols:
|
| 69 |
if task == "qa":
|
| 70 |
eval_col = BenchmarksQA[c].value
|
| 71 |
-
elif task == "
|
| 72 |
eval_col = BenchmarksLongDoc[c].value
|
| 73 |
if eval_col.domain not in domain_query:
|
| 74 |
continue
|
|
@@ -127,7 +127,7 @@ def update_metric(
|
|
| 127 |
reranking_model,
|
| 128 |
query
|
| 129 |
)
|
| 130 |
-
elif task ==
|
| 131 |
leaderboard_df = get_leaderboard_df(raw_data, task=task, metric=metric)
|
| 132 |
return update_table_long_doc(
|
| 133 |
leaderboard_df,
|
|
|
|
| 47 |
def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
|
| 48 |
if task == "qa":
|
| 49 |
cols = list(frozenset(COLS_QA).intersection(frozenset(BENCHMARK_COLS_QA)).intersection(frozenset(columns)))
|
| 50 |
+
elif task == "long-doc":
|
| 51 |
cols = list(frozenset(COLS_LONG_DOC).intersection(frozenset(BENCHMARK_COLS_LONG_DOC)).intersection(frozenset(columns)))
|
| 52 |
else:
|
| 53 |
raise NotImplemented
|
|
|
|
| 68 |
for c in cols:
|
| 69 |
if task == "qa":
|
| 70 |
eval_col = BenchmarksQA[c].value
|
| 71 |
+
elif task == "long-doc":
|
| 72 |
eval_col = BenchmarksLongDoc[c].value
|
| 73 |
if eval_col.domain not in domain_query:
|
| 74 |
continue
|
|
|
|
| 127 |
reranking_model,
|
| 128 |
query
|
| 129 |
)
|
| 130 |
+
elif task == "long-doc":
|
| 131 |
leaderboard_df = get_leaderboard_df(raw_data, task=task, metric=metric)
|
| 132 |
return update_table_long_doc(
|
| 133 |
leaderboard_df,
|