|
import os |
|
import uuid |
|
import numpy as np |
|
import pandas as pd |
|
import streamlit as st |
|
import huggingface_hub as hh |
|
from datetime import datetime |
|
|
|
|
|
# Hugging Face organisation that owns the Space and the dataset repos below.
OWNER = "Booking-com"
# Highest revision number a group may reach; enforced in get_revision().
MAX_SUBMISSIONS = 100

# Repository identifiers, all under OWNER.
REPO_ID = OWNER + "/streamlit-review-ranking-leaderboard"
RESULTS_REPO = OWNER + "/results"
GT_REPO = OWNER + "/accommodation-reviews-gt"
GROUPS_INFO_REPO = OWNER + "/rectour2024-groups"

# Access token (None when HF_TOKEN is unset) and the root for local copies
# (current directory when HF_HOME is unset).
TOKEN = os.getenv("HF_TOKEN")
CACHE_PATH = os.environ.get("HF_HOME", ".")

# Local directories, one per downloaded repo plus a staging area for uploads.
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
TEMP_RESULTS_PATH = os.path.join(CACHE_PATH, "temp-results")
GT_PATH = os.path.join(CACHE_PATH, "gt")
GROUPS_INFO_PATH = os.path.join(CACHE_PATH, "groups-info")

# Columns every prediction file must contain: the two key columns followed by
# review_1 .. review_10.
REQUIRED_COLUMNS = ['accommodation_id', 'user_id', *(f'review_{i}' for i in range(1, 11))]

# Shared Hub client, authenticated with TOKEN.
API = hh.HfApi(token=TOKEN)
|
|
|
|
|
def restart_space():
    """Ask the Hub, through the shared ``API`` client, to restart the Space ``REPO_ID``."""
    API.restart_space(repo_id=REPO_ID)
|
|
|
|
|
|
|
# Download the ground-truth dataset repo into GT_PATH. This runs at module level,
# i.e. every time the script is executed; calculate_metrics() reads
# 'test_matches.csv' from this directory.
hh.snapshot_download(
    repo_id=GT_REPO, local_dir=GT_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
    token=TOKEN
)
|
|
|
|
|
def refresh_data():
    """Download the results and groups-info dataset repos into their local directories.

    The results repo lands in ``EVAL_RESULTS_PATH`` and the groups-info repo in
    ``GROUPS_INFO_PATH``; both are fetched with ``TOKEN``, results first.
    """
    targets = (
        (RESULTS_REPO, EVAL_RESULTS_PATH),
        (GROUPS_INFO_REPO, GROUPS_INFO_PATH),
    )
    for dataset_repo, destination in targets:
        hh.snapshot_download(
            repo_id=dataset_repo,
            local_dir=destination,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
|
|
|
|
|
# Fetch the results and groups-info datasets at module level, before anything
# below reads from EVAL_RESULTS_PATH or GROUPS_INFO_PATH.
refresh_data()
|
|
|
|
|
def get_match_index(row):
    """Return the 1-based rank at which ``row['review_id']`` appears in the predictions.

    Looks at ``review_1`` .. ``review_10`` in order and returns the position of
    the first one equal to ``row['review_id']``; returns ``np.inf`` when none match.
    """
    hits = (
        position
        for position in range(1, 11)
        if row['review_id'] == row[f'review_{position}']
    )
    # The generator is lazy, so scanning stops at the first matching position.
    return next(hits, np.inf)
|
|
|
|
|
def calculate_metrics(df_pred):
    """Score a predictions frame against the ground truth stored under ``GT_PATH``.

    The ground truth is read from ``test_matches.csv`` and left-joined with
    ``df_pred`` on ``accommodation_id`` and ``user_id``; missing values are
    replaced by empty strings before each row is ranked with ``get_match_index``.

    Returns:
        A ``(mrr10, precision10)`` tuple: the mean of ``1 / match_index`` and the
        fraction of rows whose match index is finite.

    Raises:
        Exception: if ``df_pred`` and the ground truth differ in row count.
    """
    ground_truth = pd.read_csv(os.path.join(GT_PATH, 'test_matches.csv'))

    expected_rows, found_rows = len(ground_truth), len(df_pred)
    if expected_rows != found_rows:
        template = "Your predictions file should contain {} rows, only {} rows were found in the file"
        raise Exception(template.format(expected_rows, found_rows))

    scored = ground_truth.merge(df_pred, how='left', on=['accommodation_id', 'user_id']).fillna('')
    scored['match_index'] = scored.apply(get_match_index, axis=1)

    ranks = scored['match_index']
    # 1 / inf is 0.0, so rows without a match contribute nothing to the MRR.
    scored['mrr10'] = ranks.map(lambda rank: 1 / rank)
    scored['precision10'] = ranks.map(lambda rank: 0 if rank == np.inf else 1)

    return scored['mrr10'].mean(), scored['precision10'].mean()
|
|
|
|
|
def get_group_name_by_email(email):
    """Look up the group name registered for ``email`` in ``groups_data.csv``.

    The file is read from ``GROUPS_INFO_PATH`` and the ``email`` column is compared
    for exact equality; the ``group_name`` of the first matching row is returned.

    Raises:
        Exception: if no row has this e-mail.
    """
    groups_file = os.path.join(GROUPS_INFO_PATH, 'groups_data.csv')
    groups = pd.read_csv(groups_file)

    matching = groups.loc[groups['email'] == email].reset_index(drop=True)
    if matching.empty:
        raise Exception("E-mail is not valid")

    first_row = matching.iloc[0]
    return first_row['group_name']
|
|
|
|
|
def validate_pred_file(df_pred):
    """Check that ``df_pred`` has every column listed in ``REQUIRED_COLUMNS``.

    Raises:
        Exception: naming the first required column (in ``REQUIRED_COLUMNS``
            order) that is absent from ``df_pred.columns``.
    """
    first_missing = next(
        (name for name in REQUIRED_COLUMNS if name not in df_pred.columns),
        None,
    )
    if first_missing is not None:
        raise Exception(f"Column {first_missing} not in prediction file")
|
|
|
|
|
def get_revision(df_results, email):
    """Return the highest revision already recorded for ``email``.

    Returns 0 when ``df_results`` has no row for this e-mail; otherwise the
    maximum of the ``revision`` column over that e-mail's rows.

    Raises:
        Exception: if that maximum has reached ``MAX_SUBMISSIONS``.
    """
    own_rows = df_results.loc[df_results['email'] == email]
    if own_rows.empty:
        return 0

    latest = own_rows['revision'].max()
    if latest >= MAX_SUBMISSIONS:
        raise Exception("We're sorry but you reached your maximal number of submissions")
    return latest
|
|
|
|
|
def get_results_dataframe(results_path=None):
    """Load every result CSV in a directory into a single DataFrame.

    Args:
        results_path: Directory to scan for ``*.csv`` files. Defaults to
            ``EVAL_RESULTS_PATH`` when omitted.

    Returns:
        The concatenation of all CSV files found, read in sorted file-name order
        and given a fresh 0..n-1 index. When the directory holds no CSV files,
        an empty DataFrame with the columns that ``upload_results`` writes is
        returned, so callers can still drop or sort on those columns.
    """
    if results_path is None:
        results_path = EVAL_RESULTS_PATH

    # os.listdir() order is arbitrary; sort so the row order is reproducible.
    csv_names = sorted(name for name in os.listdir(results_path) if name.endswith('.csv'))

    if not csv_names:
        # pd.concat([]) raises ValueError, which would crash the page before the
        # first submission exists. Keep in sync with the columns in upload_results().
        return pd.DataFrame(columns=[
            'email', 'group_name', 'model_name', 'submission_date',
            'revision', 'MRR@10', 'Precision@10', 'set_name',
        ])

    frames = [pd.read_csv(os.path.join(results_path, name)) for name in csv_names]
    # Each file restarts its index at 0; ignore_index avoids duplicate labels.
    return pd.concat(frames, ignore_index=True)
|
|
|
|
|
def upload_results(group_email, group_name, model_name, revision, mrr10, precision10):
    """Write one submission as a single-row CSV and upload it to ``RESULTS_REPO``.

    The row holds the e-mail, group name, model name, the current local time
    formatted as ``YYYY-MM-DD HH:MM:SS``, the revision, both metrics and the
    fixed set name ``"test set"``. The file gets a random UUID name, is staged
    under ``TEMP_RESULTS_PATH`` and uploaded under the same name to the root of
    the dataset repo.

    Args:
        group_email: E-mail identifying the submitting group.
        group_name: Name of the submitting group.
        model_name: Name given to the submitted model.
        revision: Revision number of this submission.
        mrr10: MRR@10 score of the submission.
        precision10: Precision@10 score of the submission.

    Raises:
        Whatever ``DataFrame.to_csv`` or ``hh.upload_file`` raises; the staging
        file is removed in either case.
    """
    submission_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # exist_ok avoids the check-then-create race between concurrent sessions and
    # also creates missing parent directories.
    os.makedirs(TEMP_RESULTS_PATH, exist_ok=True)

    df_temp_results = pd.DataFrame({
        'email': [group_email],
        'group_name': [group_name],
        "model_name": [model_name],
        "submission_date": [submission_date],
        "revision": [revision],
        "MRR@10": [mrr10],
        "Precision@10": [precision10],
        'set_name': ["test set"],
    })
    temp_results_fn = str(uuid.uuid4()) + '.csv'
    temp_path = os.path.join(TEMP_RESULTS_PATH, temp_results_fn)

    try:
        df_temp_results.to_csv(temp_path, index=False)
        hh.upload_file(path_or_fileobj=temp_path, repo_id=RESULTS_REPO, token=TOKEN, repo_type="dataset",
                       path_in_repo=temp_results_fn)
    finally:
        # The staging file is only needed for the upload; delete it so files do
        # not pile up on disk, whether or not the upload succeeded.
        if os.path.exists(temp_path):
            os.remove(temp_path)
|
|
|
|
|
def render():
    """Build the Streamlit page with a "Leaderboard" tab and a "Submission" tab.

    The leaderboard tab shows all stored results (without the e-mail column) and
    offers a "Refresh" button that calls ``refresh_data()``. The submission tab
    collects a group e-mail, a model name and a zipped CSV of predictions; on
    "Upload" it validates and scores the file, uploads the result and shows a
    summary. Any exception raised during that flow is displayed in the tab.
    """
    st.set_page_config(page_title="RecTour2024 - Booking.com Review Ranking Challenge Leaderboard", layout="wide")
    # NOTE(review): the leading "π" looks like a mis-encoded emoji — confirm the intended character.
    st.title("π RecTour2024 Leaderboard")

    leaderboard_tab, submission_tab = st.tabs(["Leaderboard", "Submission"])

    # --- Leaderboard tab ---
    if leaderboard_tab.button("Refresh"):
        refresh_data()

    # Loaded once here and reused below to work out the submitter's next revision.
    df_results = get_results_dataframe()
    # Hide e-mails; order by set name, then best MRR@10 first.
    leaderboard_tab.dataframe(df_results.drop(columns=['email']).sort_values(['set_name', 'MRR@10'],
                                                                             ascending=[True, False]))

    # --- Submission tab ---
    group_email = submission_tab.text_input(label="Group email", value="")
    model_name = submission_tab.text_input(label="Model name", value="")
    pred_file = submission_tab.file_uploader(label="Upload your prediction file",
                                             help="Upload a csv.zip file, in pandas this can be achieved "
                                                  "with df.to_csv(<file_path>, compression='zip')",)
    if submission_tab.button("Upload"):
        if not pred_file:
            submission_tab.markdown("no file was submitted!")
        else:
            try:
                # The e-mail is checked first, so an unknown e-mail fails before the file is parsed.
                group_name = get_group_name_by_email(group_email)
                df_pred = pd.read_csv(pred_file, compression='zip')
                validate_pred_file(df_pred)
                mrr10, precision10 = calculate_metrics(df_pred)
                # get_revision() returns the last recorded revision (0 if none), so add 1 for this one.
                revision = get_revision(df_results=df_results, email=group_email) + 1
                upload_results(group_email=group_email, group_name=group_name, model_name=model_name, revision=revision,
                               mrr10=mrr10, precision10=precision10)

                submission_tab.markdown("## THANK YOU FOR YOUR SUBMISSION!")
                submission_tab.markdown("Here are your submission details:")
                submission_tab.markdown("**Group name:** " + group_name)
                submission_tab.markdown("**Model name:** " + model_name)
                submission_tab.markdown("**Revision:** " + str(revision) +
                                        f" (out of {MAX_SUBMISSIONS} allowed submissions)")

                submission_tab.write("### Submission results")
                submission_tab.markdown("**MRR@10:** {:.4f}".format(mrr10))
                submission_tab.markdown("**Precision@10:** {:.4f}".format(precision10))
            except Exception as e:
                # Any failure above (unknown e-mail, bad file, submission limit, upload error)
                # is passed to the tab as markdown instead of propagating.
                submission_tab.markdown(e)
|
|
|
|
|
# Build the page when this file is executed as a script.
if __name__ == "__main__":
    render()
|
|
|
|