Eran Fainman
committed on
Commit
·
1b9b8b6
1
Parent(s):
c70e5b6
Add application file
Browse files
app.py
CHANGED
@@ -1,17 +1,29 @@
|
|
1 |
import os
|
|
|
2 |
import pandas as pd
|
3 |
import streamlit as st
|
4 |
import huggingface_hub as hh
|
|
|
5 |
|
6 |
# read files from HF
|
7 |
OWNER = "Booking-com"
|
|
|
|
|
8 |
REPO_ID = f"{OWNER}/streamlit-review-ranking-leaderboard"
|
9 |
RESULTS_REPO = f"{OWNER}/results"
|
10 |
GT_REPO = f"{OWNER}/accommodation-reviews-gt"
|
|
|
|
|
11 |
TOKEN = os.environ.get("HF_TOKEN")
|
12 |
CACHE_PATH = os.getenv("HF_HOME", ".")
|
|
|
13 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
|
|
14 |
GT_PATH = os.path.join(CACHE_PATH, "gt")
|
|
|
|
|
|
|
|
|
15 |
|
16 |
API = hh.HfApi(token=TOKEN)
|
17 |
|
@@ -20,24 +32,81 @@ def restart_space():
|
|
20 |
API.restart_space(repo_id=REPO_ID)
|
21 |
|
22 |
|
23 |
-
|
24 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
|
25 |
-
token=TOKEN
|
26 |
-
)
|
27 |
-
|
28 |
hh.snapshot_download(
|
29 |
repo_id=GT_REPO, local_dir=GT_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
|
30 |
token=TOKEN
|
31 |
)
|
32 |
|
33 |
|
34 |
-
def
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
mrr10 = 0.3
|
38 |
precision10 = 0.2
|
39 |
return mrr10, precision10
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
def render():
|
42 |
st.set_page_config(page_title="RecTour2024 - Booking.com Review Ranking Challenge Leaderboard", layout="wide")
|
43 |
st.title("🏆 RecTour2024 Leaderboard")
|
@@ -46,33 +115,41 @@ def render():
|
|
46 |
|
47 |
# leaderboard area
|
48 |
if leaderboard_tab.button("Refresh"):
|
49 |
-
|
50 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
|
51 |
-
token=TOKEN
|
52 |
-
)
|
53 |
|
54 |
-
df_results =
|
55 |
-
leaderboard_tab.
|
56 |
|
57 |
# submission area
|
58 |
-
|
59 |
model_name = submission_tab.text_input(label="Model name", value="")
|
60 |
-
|
61 |
-
|
|
|
62 |
if submission_tab.button("Upload"):
|
63 |
if not pred_file:
|
64 |
submission_tab.markdown("no file was submitted!")
|
65 |
else:
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
|
78 |
if __name__ == "__main__":
|
|
|
1 |
import os
|
2 |
+
import uuid
|
3 |
import pandas as pd
|
4 |
import streamlit as st
|
5 |
import huggingface_hub as hh
|
6 |
+
from datetime import datetime
|
7 |
|
8 |
# read files from HF
|
9 |
# Hugging Face organisation that owns the Space and the datasets below.
OWNER = "Booking-com"
# Submission cap referenced in the upload confirmation message.
MAX_SUBMISSIONS = 20

# Repository identifiers, all under OWNER.
REPO_ID = OWNER + "/streamlit-review-ranking-leaderboard"
RESULTS_REPO = OWNER + "/results"
GT_REPO = OWNER + "/accommodation-reviews-gt"
GROUPS_INFO_REPO = OWNER + "/rectour2024-groups"

# Access token and cache root come from the environment.
TOKEN = os.getenv("HF_TOKEN")
CACHE_PATH = os.environ.get("HF_HOME", ".")

# Local directories (under the cache root) used for downloaded and temporary files.
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
TEMP_RESULTS_PATH = os.path.join(CACHE_PATH, "temp-results")
GT_PATH = os.path.join(CACHE_PATH, "gt")
GROUPS_INFO_PATH = os.path.join(CACHE_PATH, "groups-info")

# Columns every prediction file must contain: two id columns plus review_1..review_10.
REQUIRED_COLUMNS = ['accommodation_id', 'user_id', *(f'review_{i}' for i in range(1, 11))]
|
26 |
+
|
27 |
|
28 |
API = hh.HfApi(token=TOKEN)
|
29 |
|
|
|
32 |
API.restart_space(repo_id=REPO_ID)
|
33 |
|
34 |
|
35 |
+
# Download the ground truth once at import time - it shouldn't update too frequently.
hh.snapshot_download(
    repo_id=GT_REPO,
    local_dir=GT_PATH,
    repo_type="dataset",
    tqdm_class=None,
    etag_timeout=30,
    token=TOKEN,
)
|
40 |
|
41 |
|
42 |
+
def refresh_data():
    """Re-download the results dataset and the groups-info dataset.

    The results snapshot goes to EVAL_RESULTS_PATH and the groups snapshot to
    GROUPS_INFO_PATH, in that order, using the module-level TOKEN.
    """
    download_targets = (
        (RESULTS_REPO, EVAL_RESULTS_PATH),
        (GROUPS_INFO_REPO, GROUPS_INFO_PATH),
    )
    for dataset_repo, target_dir in download_targets:
        hh.snapshot_download(
            repo_id=dataset_repo,
            local_dir=target_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
|
52 |
+
|
53 |
+
|
54 |
+
refresh_data()
|
55 |
+
|
56 |
+
|
57 |
+
def calculate_metrics(df_pred):
    """Return the (MRR@10, Precision@10) pair for a prediction frame.

    NOTE: this is still a placeholder. The ground-truth file is read (so a
    missing file raises here), but neither it nor ``df_pred`` is used yet and
    the returned scores are fixed values.

    Args:
        df_pred: Prediction DataFrame (currently unused).

    Returns:
        Tuple ``(mrr10, precision10)``.
    """
    gt_file = os.path.join(GT_PATH, 'val_matches.csv')
    df_gt = pd.read_csv(gt_file)  # TODO: compare df_pred against df_gt

    return 0.3, 0.2
|
63 |
|
64 |
+
|
65 |
+
def get_group_name_by_email(email, groups_info_path=None):
    """Look up the group name registered for an e-mail address.

    Reads ``groups_data.csv`` from the groups-info directory and returns the
    ``group_name`` of the first row whose ``email`` column equals ``email``.

    Args:
        email: E-mail address to look up (exact match).
        groups_info_path: Directory holding ``groups_data.csv``. Defaults to
            the module-level GROUPS_INFO_PATH.

    Returns:
        The ``group_name`` value of the first matching row.

    Raises:
        ValueError: If no row matches ``email``. ValueError is a subclass of
            Exception, so callers catching Exception are unaffected.
    """
    if groups_info_path is None:
        groups_info_path = GROUPS_INFO_PATH

    # The file name must be joined onto the directory; passing it as a second
    # positional argument to read_csv never opens the intended file.
    df = pd.read_csv(os.path.join(groups_info_path, 'groups_data.csv'))
    df_email = df[df['email'] == email].reset_index(drop=True)
    if df_email.empty:
        raise ValueError("E-mail is not valid")
    return df_email.iloc[0]['group_name']
|
72 |
+
|
73 |
+
|
74 |
+
def validate_pred_file(df_pred):
    """Check that a prediction frame has every column in REQUIRED_COLUMNS.

    Args:
        df_pred: Prediction DataFrame to check.

    Raises:
        Exception: Naming the first required column (in REQUIRED_COLUMNS
            order) that is absent from ``df_pred``.
    """
    available = df_pred.columns
    col = next((name for name in REQUIRED_COLUMNS if name not in available), None)
    if col is not None:
        raise Exception(f"Column {col} not in prediction file")
|
78 |
+
|
79 |
+
|
80 |
+
def get_revision(df_results, email):
    """Return the highest revision recorded for ``email``, or 0 if none.

    Args:
        df_results: Results DataFrame with ``email`` and ``revision`` columns.
        email: Group e-mail whose submissions are inspected.

    Returns:
        The maximum ``revision`` among rows matching ``email``; 0 when the
        group has no rows.
    """
    group_rows = df_results[df_results['email'] == email]
    if group_rows.empty:
        return 0
    return group_rows['revision'].max()
|
86 |
+
|
87 |
+
|
88 |
+
def get_results_dataframe(results_path=None):
    """Load every result CSV in the results directory into one DataFrame.

    Args:
        results_path: Directory to scan for ``*.csv`` files (top level only).
            Defaults to the module-level EVAL_RESULTS_PATH.

    Returns:
        A DataFrame with the rows of all CSV files, re-indexed 0..n-1. When
        the directory holds no CSV file, an empty DataFrame with the columns
        written by ``upload_results`` is returned, so callers that select or
        drop ``email`` / ``revision`` keep working on a fresh results repo.
    """
    if results_path is None:
        results_path = EVAL_RESULTS_PATH

    # Sorted so the row order does not depend on the OS directory listing.
    csv_names = sorted(f for f in os.listdir(results_path) if f.endswith('.csv'))
    if not csv_names:
        # pd.concat([]) raises ValueError; return an empty, well-shaped frame.
        return pd.DataFrame(columns=['email', 'group_name', 'model_name', 'submission_date',
                                     'revision', 'MRR10', 'PRECISION10'])

    dfs = [pd.read_csv(os.path.join(results_path, f)) for f in csv_names]
    # ignore_index gives unique row labels; each file would otherwise restart at 0.
    return pd.concat(dfs, ignore_index=True)
|
94 |
+
|
95 |
+
|
96 |
+
def upload_results(group_email, group_name, model_name, revision, mrr10, precision10):
    """Write one submission's scores to a CSV and upload it to the results dataset.

    A single-row CSV is written under TEMP_RESULTS_PATH with a random UUID
    file name and then uploaded to RESULTS_REPO under that same file name.

    Args:
        group_email: E-mail identifying the submitting group.
        group_name: Name of the submitting group.
        model_name: Model name supplied with the submission.
        revision: Revision number of this submission.
        mrr10: MRR@10 score.
        precision10: Precision@10 score.
    """
    submission_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # exist_ok avoids a crash if another session creates the directory between
    # an existence check and the creation.
    os.makedirs(TEMP_RESULTS_PATH, exist_ok=True)

    df_temp_results = pd.DataFrame({
        'email': [group_email],
        'group_name': [group_name],
        # The key must be the literal column name, not the value of model_name,
        # otherwise every submission creates a differently named column.
        'model_name': [model_name],
        "submission_date": [submission_date],
        "revision": [revision],
        "MRR10": [mrr10],
        "PRECISION10": [precision10],
    })

    file_name = str(uuid.uuid4()) + '.csv'
    temp_path = os.path.join(TEMP_RESULTS_PATH, file_name)
    df_temp_results.to_csv(temp_path, index=False)

    # path_in_repo is a required argument of upload_file; the file is placed at
    # the repository root under its UUID name.
    hh.upload_file(path_or_fileobj=temp_path, path_in_repo=file_name, repo_id=RESULTS_REPO, token=TOKEN,
                   repo_type="dataset")
|
108 |
+
|
109 |
+
|
110 |
def render():
|
111 |
st.set_page_config(page_title="RecTour2024 - Booking.com Review Ranking Challenge Leaderboard", layout="wide")
|
112 |
st.title("🏆 RecTour2024 Leaderboard")
|
|
|
115 |
|
116 |
# leaderboard area
|
117 |
if leaderboard_tab.button("Refresh"):
|
118 |
+
refresh_data()
|
|
|
|
|
|
|
119 |
|
120 |
+
df_results = get_results_dataframe()
|
121 |
+
leaderboard_tab.dataframe(df_results.drop(columns=['email']))
|
122 |
|
123 |
# submission area
|
124 |
+
group_email = submission_tab.text_input(label="Group email", value="")
|
125 |
model_name = submission_tab.text_input(label="Model name", value="")
|
126 |
+
pred_file = submission_tab.file_uploader(label="Upload your prediction file",
|
127 |
+
help="Upload a csv.zip file, in pandas this can be achieved "
|
128 |
+
"with df.to_csv(<file_path>, compression='zip')",)
|
129 |
if submission_tab.button("Upload"):
|
130 |
if not pred_file:
|
131 |
submission_tab.markdown("no file was submitted!")
|
132 |
else:
|
133 |
+
try:
|
134 |
+
group_name = get_group_name_by_email(group_email)
|
135 |
+
df_pred = pd.read_csv(pred_file, compression='zip')
|
136 |
+
validate_pred_file(df_pred)
|
137 |
+
mrr10, precision10 = calculate_metrics(df_pred)
|
138 |
+
revision = get_revision(df_results=df_results, email=group_email) + 1 # generate next revision id
|
139 |
+
upload_results(group_email=group_email, group_name=group_name, model_name=model_name, revision=revision,
|
140 |
+
mrr10=mrr10, precision10=precision10)
|
141 |
+
|
142 |
+
submission_tab.markdown("## THANK YOU FOR YOUR SUBMISSION!")
|
143 |
+
submission_tab.markdown("Here are your submission details:")
|
144 |
+
submission_tab.markdown("**Group name:** " + group_name)
|
145 |
+
submission_tab.markdown("**Model name:** " + model_name)
|
146 |
+
submission_tab.markdown("**Revision:** " + revision + "(out of {MAX_SUBMISSIONS} allowed submissions)")
|
147 |
+
|
148 |
+
submission_tab.write("### Submission results")
|
149 |
+
submission_tab.markdown("**MRR@10:** {:.4f}".format(mrr10))
|
150 |
+
submission_tab.markdown("**Precision@10:** {:.4f}".format(precision10))
|
151 |
+
except Exception as e:
|
152 |
+
submission_tab.markdown(e)
|
153 |
|
154 |
|
155 |
if __name__ == "__main__":
|