import os
import uuid
from datetime import datetime

import huggingface_hub as hh
import numpy as np
import pandas as pd
import streamlit as st
# Hugging Face organisation that owns the Space and its companion dataset repos.
OWNER = "Booking-com"
REPO_ID = f"{OWNER}/streamlit-review-ranking-leaderboard"
RESULTS_REPO = f"{OWNER}/results"
GT_REPO = f"{OWNER}/accommodation-reviews-gt"
GROUPS_INFO_REPO = f"{OWNER}/rectour2024-groups"

# Access token for the Hub; None when HF_TOKEN is not set in the environment.
TOKEN = os.environ.get("HF_TOKEN")

# Local directories (under HF_HOME, or the working directory) for downloaded data.
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
TEMP_RESULTS_PATH = os.path.join(CACHE_PATH, "temp-results")
GT_PATH = os.path.join(CACHE_PATH, "gt")
GROUPS_INFO_PATH = os.path.join(CACHE_PATH, "groups-info")

# Columns every prediction file must provide: the two keys plus review_1..review_10.
REQUIRED_COLUMNS = ['accommodation_id', 'user_id'] + [f'review_{i}' for i in range(1, 11)]

# NOTE(review): get_revision() and render() both read MAX_SUBMISSIONS, but the
# file never defined it, so every submission failed with a NameError. The value
# below is a placeholder -- confirm the challenge's real per-group limit.
MAX_SUBMISSIONS = 5

API = hh.HfApi(token=TOKEN)
def restart_space():
    """Ask the Hugging Face Hub to restart the Space identified by ``REPO_ID``."""
    API.restart_space(repo_id=REPO_ID)
# Fetch the ground-truth dataset into GT_PATH when the script is executed;
# calculate_metrics() reads test_matches.csv from that directory.
hh.snapshot_download(
    repo_id=GT_REPO,
    repo_type="dataset",
    local_dir=GT_PATH,
    token=TOKEN,
    etag_timeout=30,
    tqdm_class=None,
)
def refresh_data():
    """Download the results and groups-info datasets into their local folders.

    The results dataset is downloaded first (into ``EVAL_RESULTS_PATH``),
    followed by the groups-info dataset (into ``GROUPS_INFO_PATH``).
    """
    download_targets = (
        (RESULTS_REPO, EVAL_RESULTS_PATH),
        (GROUPS_INFO_REPO, GROUPS_INFO_PATH),
    )
    for dataset_repo, target_dir in download_targets:
        hh.snapshot_download(
            repo_id=dataset_repo,
            local_dir=target_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )


# Populate the local copies once when the script is executed.
refresh_data()
def get_match_index(row, k=10):
    """Return the 1-based rank at which the true review appears in a prediction row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) holding ``review_id``
            and the ranked predictions ``review_1`` .. ``review_{k}``.
        k: Number of ranked prediction columns to inspect. Defaults to 10,
            which matches the ``review_1`` .. ``review_10`` columns listed in
            ``REQUIRED_COLUMNS``.

    Returns:
        The smallest rank ``i`` in ``1..k`` for which ``review_{i}`` equals
        ``review_id``, or ``np.inf`` when none of the ``k`` predictions match.

    Raises:
        KeyError: If ``row`` lacks ``review_id`` or one of the inspected
            ``review_{i}`` entries.
    """
    target = row['review_id']
    for rank in range(1, k + 1):
        if row[f'review_{rank}'] == target:
            # First hit wins, so a repeated prediction cannot worsen the rank.
            return rank
    # np.inf makes the reciprocal rank computed by callers evaluate to 0.
    return np.inf
def calculate_metrics(df_pred):
    """Score a predictions frame against the ground truth in ``GT_PATH``.

    The ground truth is read from ``test_matches.csv`` and left-joined with the
    predictions on ``(accommodation_id, user_id)``. For every ground-truth row
    the rank of ``review_id`` among ``review_1`` .. ``review_10`` is looked up
    with ``get_match_index``.

    Args:
        df_pred: DataFrame with the columns listed in ``REQUIRED_COLUMNS``.

    Returns:
        A ``(mrr10, precision10)`` tuple: the mean reciprocal rank (0 for rows
        without a match) and the fraction of rows whose true review appears
        among the ten predictions.

    Raises:
        Exception: If the number of prediction rows differs from the number of
            ground-truth rows, or if a key pair occurs more than once in the
            predictions.
    """
    key_columns = ['accommodation_id', 'user_id']
    df_gt = pd.read_csv(os.path.join(GT_PATH, 'test_matches.csv'))

    if len(df_pred) != len(df_gt):
        # "but" rather than "only": the file may hold too many rows, not just too few.
        raise Exception("Your predictions file should contain {} rows, but {} rows were found in the file".format(
            len(df_gt), len(df_pred)
        ))

    # A repeated key would multiply rows in the join below and count the same
    # ground-truth entry several times in both means.
    if df_pred.duplicated(subset=key_columns).any():
        raise Exception("Your predictions file contains duplicate (accommodation_id, user_id) rows")

    # Ground-truth rows without a prediction get '' in every review_i column
    # and therefore never match.
    df_merged = pd.merge(df_gt, df_pred, how='left', on=key_columns).fillna('')
    match_index = df_merged.apply(get_match_index, axis=1)

    mrr10 = (1 / match_index).mean()  # 1 / inf == 0.0 for misses
    precision10 = (match_index != np.inf).mean()
    return mrr10, precision10
def get_group_name_by_email(email):
    """Look up the group registered under ``email`` in ``groups_data.csv``.

    Args:
        email: Address to look for; compared for exact equality with the
            ``email`` column.

    Returns:
        The ``group_name`` value of the first matching row.

    Raises:
        Exception: If no row carries this e-mail address.
    """
    groups = pd.read_csv(os.path.join(GROUPS_INFO_PATH, 'groups_data.csv'))
    matches = groups[groups['email'] == email].reset_index(drop=True)
    if len(matches) == 0:
        raise Exception("E-mail is not valid")
    first_row = matches.iloc[0]
    return first_row['group_name']
def validate_pred_file(df_pred):
    """Ensure the predictions frame has every column in ``REQUIRED_COLUMNS``.

    Args:
        df_pred: DataFrame parsed from the uploaded prediction file.

    Raises:
        Exception: Naming the first required column (in ``REQUIRED_COLUMNS``
            order) that is absent from ``df_pred``.
    """
    available = df_pred.columns
    first_missing = next((name for name in REQUIRED_COLUMNS if name not in available), None)
    if first_missing is not None:
        raise Exception(f"Column {first_missing} not in prediction file")
def get_revision(df_results, email):
    """Return the highest revision already recorded for ``email``.

    Args:
        df_results: Results frame with ``email`` and ``revision`` columns.
        email: Group e-mail whose submissions are counted.

    Returns:
        The maximum ``revision`` among the group's rows, or 0 when the group
        has no rows yet.

    Raises:
        Exception: If that revision has reached ``MAX_SUBMISSIONS``.
    """
    group_rows = df_results[df_results['email'] == email]
    latest = group_rows['revision'].max() if len(group_rows) > 0 else 0
    if latest >= MAX_SUBMISSIONS:
        raise Exception("We're sorry but you reached your maximal number of submissions")
    return latest
# Columns written by upload_results(); used to shape the empty leaderboard.
_RESULT_COLUMNS = ['email', 'group_name', 'model_name', 'submission_date', 'revision', 'MRR@10', 'Precision@10']


def get_results_dataframe(results_path=None):
    """Load every ``*.csv`` result file in a directory into one DataFrame.

    Args:
        results_path: Directory to scan. Defaults to ``EVAL_RESULTS_PATH``.

    Returns:
        The concatenation of all CSV files, read in file-name order and
        re-indexed from 0. When the directory holds no CSV file, an empty
        DataFrame with the result columns is returned so that callers can
        still filter on ``email`` / ``revision`` and drop ``email``.
    """
    directory = EVAL_RESULTS_PATH if results_path is None else results_path
    # os.listdir order is arbitrary; sorting keeps the row order stable between runs.
    csv_names = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
    if not csv_names:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(columns=_RESULT_COLUMNS)
    frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_names]
    # Each file starts its own index at 0; ignore_index avoids repeated labels.
    return pd.concat(frames, ignore_index=True)
def upload_results(group_email, group_name, model_name, revision, mrr10, precision10):
    """Write one submission to a CSV file and upload it to ``RESULTS_REPO``.

    The row is saved under ``TEMP_RESULTS_PATH`` in a file named with a random
    UUID, then uploaded to the results dataset under the same file name.

    Args:
        group_email: E-mail identifying the submitting group.
        group_name: Display name of the group.
        model_name: Name the group gave to the submitted model.
        revision: Revision number of this submission.
        mrr10: MRR@10 score of the submission.
        precision10: Precision@10 score of the submission.
    """
    # Naive timestamp in the server's local time.
    submission_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # to_csv does not create parent directories, so make sure the folder exists.
    os.makedirs(TEMP_RESULTS_PATH, exist_ok=True)

    df_temp_results = pd.DataFrame({
        'email': [group_email],
        'group_name': [group_name],
        "model_name": [model_name],
        "submission_date": [submission_date],
        "revision": [revision],
        "MRR@10": [mrr10],
        "Precision@10": [precision10],
    })
    temp_results_fn = str(uuid.uuid4()) + '.csv'
    temp_path = os.path.join(TEMP_RESULTS_PATH, temp_results_fn)
    df_temp_results.to_csv(temp_path, index=False)
    hh.upload_file(path_or_fileobj=temp_path, repo_id=RESULTS_REPO, token=TOKEN, repo_type="dataset",
                   path_in_repo=temp_results_fn)
def render():
    """Draw the Streamlit page: a leaderboard tab and a submission tab.

    The leaderboard tab shows the stored results without the ``email`` column
    and offers a "Refresh" button that re-downloads the data. The submission
    tab collects the group e-mail, a model name and a zipped CSV of
    predictions; pressing "Upload" validates the group and its remaining
    quota, scores the file, uploads the result and shows a summary. Any
    failure along the way is shown to the user instead of raising.
    """
    st.set_page_config(page_title="RecTour2024 - Booking.com Review Ranking Challenge Leaderboard", layout="wide")
    st.title("π RecTour2024 Leaderboard")
    leaderboard_tab, submission_tab = st.tabs(["Leaderboard", "Submission"])

    if leaderboard_tab.button("Refresh"):
        refresh_data()
    df_results = get_results_dataframe()
    # The e-mail column is only used to match groups; keep it off the table.
    leaderboard_tab.dataframe(df_results.drop(columns=['email']))

    # Strip surrounding whitespace so a pasted trailing space does not make a
    # registered e-mail look unknown; the same value is used for every lookup.
    group_email = submission_tab.text_input(label="Group email", value="").strip()
    model_name = submission_tab.text_input(label="Model name", value="").strip()
    pred_file = submission_tab.file_uploader(
        label="Upload your prediction file",
        help="Upload a csv.zip file, in pandas this can be achieved "
             "with df.to_csv(<file_path>, compression='zip')",
    )

    if not submission_tab.button("Upload"):
        return
    if not pred_file:
        submission_tab.markdown("no file was submitted!")
        return

    try:
        group_name = get_group_name_by_email(group_email)
        # Check the quota before parsing and scoring the (possibly large) file.
        revision = get_revision(df_results=df_results, email=group_email) + 1
        df_pred = pd.read_csv(pred_file, compression='zip')
        validate_pred_file(df_pred)
        mrr10, precision10 = calculate_metrics(df_pred)
        upload_results(group_email=group_email, group_name=group_name, model_name=model_name, revision=revision,
                       mrr10=mrr10, precision10=precision10)
    except Exception as e:
        # UI boundary: report the problem to the user rather than crash the page.
        submission_tab.error(str(e))
    else:
        submission_tab.markdown("## THANK YOU FOR YOUR SUBMISSION!")
        submission_tab.markdown("Here are your submission details:")
        submission_tab.markdown(f"**Group name:** {group_name}")
        submission_tab.markdown(f"**Model name:** {model_name}")
        submission_tab.markdown(f"**Revision:** {revision} (out of {MAX_SUBMISSIONS} allowed submissions)")
        submission_tab.write("### Submission results")
        submission_tab.markdown("**MRR@10:** {:.4f}".format(mrr10))
        submission_tab.markdown("**Precision@10:** {:.4f}".format(precision10))


if __name__ == "__main__":
    render()