Eran Fainman committed on
Commit
1b9b8b6
·
1 Parent(s): c70e5b6

Add application file

Browse files
Files changed (1) hide show
  1. app.py +104 -27
app.py CHANGED
@@ -1,17 +1,29 @@
1
  import os
 
2
  import pandas as pd
3
  import streamlit as st
4
  import huggingface_hub as hh
 
5
 
6
  # read files from HF
7
  OWNER = "Booking-com"
 
 
8
  REPO_ID = f"{OWNER}/streamlit-review-ranking-leaderboard"
9
  RESULTS_REPO = f"{OWNER}/results"
10
  GT_REPO = f"{OWNER}/accommodation-reviews-gt"
 
 
11
  TOKEN = os.environ.get("HF_TOKEN")
12
  CACHE_PATH = os.getenv("HF_HOME", ".")
 
13
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
14
  GT_PATH = os.path.join(CACHE_PATH, "gt")
 
 
 
 
15
 
16
  API = hh.HfApi(token=TOKEN)
17
 
@@ -20,24 +32,81 @@ def restart_space():
20
  API.restart_space(repo_id=REPO_ID)
21
 
22
 
23
- hh.snapshot_download(
24
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
25
- token=TOKEN
26
- )
27
-
28
  hh.snapshot_download(
29
  repo_id=GT_REPO, local_dir=GT_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
30
  token=TOKEN
31
  )
32
 
33
 
34
- def calculate_metrics(pred_file):
35
- df = pd.read_csv(pred_file)
36
- gt = pd.read_csv(os.path.join(GT_PATH, 'val_matches.csv'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  mrr10 = 0.3
38
  precision10 = 0.2
39
  return mrr10, precision10
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def render():
42
  st.set_page_config(page_title="RecTour2024 - Booking.com Review Ranking Challenge Leaderboard", layout="wide")
43
  st.title("🏆 RecTour2024 Leaderboard")
@@ -46,33 +115,41 @@ def render():
46
 
47
  # leaderboard area
48
  if leaderboard_tab.button("Refresh"):
49
- hh.snapshot_download(
50
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
51
- token=TOKEN
52
- )
53
 
54
- df_results = pd.read_csv(os.path.join(EVAL_RESULTS_PATH, 'results.csv'))
55
- leaderboard_tab.table(df_results)
56
 
57
  # submission area
58
- group_name = submission_tab.text_input(label="Group name", value="")
59
  model_name = submission_tab.text_input(label="Model name", value="")
60
- group_token = submission_tab.text_input(label="Group token", value="")
61
- pred_file = submission_tab.file_uploader(label="Upload your prediction file")
 
62
  if submission_tab.button("Upload"):
63
  if not pred_file:
64
  submission_tab.markdown("no file was submitted!")
65
  else:
66
- mrr10, precision10 = calculate_metrics(pred_file)
67
- submission_tab.markdown("## THANK YOU FOR YOUR SUBMISSION!")
68
- submission_tab.markdown("Here are your submission details:")
69
- submission_tab.markdown("**Group name:** " + group_name)
70
- submission_tab.markdown("**Model name:** " + model_name)
71
- submission_tab.markdown("**Revision:** ")
72
-
73
- submission_tab.write("### Submission results")
74
- submission_tab.markdown("**MRR@10:** {:.4f}".format(mrr10))
75
- submission_tab.markdown("**Precision@10:** {:.4f}".format(precision10))
 
 
 
 
 
 
 
 
 
 
76
 
77
 
78
  if __name__ == "__main__":
 
1
  import os
2
+ import uuid
3
  import pandas as pd
4
  import streamlit as st
5
  import huggingface_hub as hh
6
+ from datetime import datetime
7
 
8
# --- Hugging Face configuration -------------------------------------------
# Organisation that owns the Space and every dataset repo used below.
OWNER = "Booking-com"
# Submission allowance quoted to participants in the upload confirmation.
MAX_SUBMISSIONS = 20

# Repository identifiers, all under OWNER.
REPO_ID = f"{OWNER}/streamlit-review-ranking-leaderboard"
RESULTS_REPO = f"{OWNER}/results"
GT_REPO = f"{OWNER}/accommodation-reviews-gt"
GROUPS_INFO_REPO = f"{OWNER}/rectour2024-groups"

# Access token and cache root come from the environment; the cache root
# falls back to the current directory when HF_HOME is unset.
TOKEN = os.getenv("HF_TOKEN")
CACHE_PATH = os.environ.get("HF_HOME", ".")

# Local directories (under the cache root) that mirror the dataset repos.
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
TEMP_RESULTS_PATH = os.path.join(CACHE_PATH, "temp-results")
GT_PATH = os.path.join(CACHE_PATH, "gt")
GROUPS_INFO_PATH = os.path.join(CACHE_PATH, "groups-info")

# Columns a prediction file must contain: two id columns followed by
# review_1 .. review_10.
REQUIRED_COLUMNS = [
    "accommodation_id",
    "user_id",
    *(f"review_{rank}" for rank in range(1, 11)),
]

# Shared, authenticated Hub client.
API = hh.HfApi(token=TOKEN)
29
 
 
32
  API.restart_space(repo_id=REPO_ID)
33
 
34
 
35
# Download the ground truth once at start-up; it should not change often,
# so it is not part of refresh_data().
hh.snapshot_download(
    repo_id=GT_REPO,
    repo_type="dataset",
    local_dir=GT_PATH,
    token=TOKEN,
    etag_timeout=30,
    tqdm_class=None,
)
40
 
41
 
42
def refresh_data():
    """Re-download the results and group-info datasets into their local folders."""
    # (repo id, local directory) pairs, fetched in this order.
    targets = (
        (RESULTS_REPO, EVAL_RESULTS_PATH),
        (GROUPS_INFO_REPO, GROUPS_INFO_PATH),
    )
    for repo, destination in targets:
        hh.snapshot_download(
            repo_id=repo,
            repo_type="dataset",
            local_dir=destination,
            token=TOKEN,
            etag_timeout=30,
            tqdm_class=None,
        )


# Populate the local copies once when the module is loaded.
refresh_data()
55
+
56
+
57
def calculate_metrics(df_pred):
    """Return placeholder ``(mrr10, precision10)`` scores for a prediction frame.

    The ground-truth matches file is loaded but not yet compared against
    ``df_pred``; the returned scores are fixed constants.
    """
    gt_file = os.path.join(GT_PATH, 'val_matches.csv')
    # Loaded for parity with the eventual implementation; the frame itself
    # is currently unused.
    _ground_truth = pd.read_csv(gt_file)

    return 0.3, 0.2
63
 
64
+
65
def get_group_name_by_email(email, groups_dir=None):
    """Look up the group registered under *email*.

    Args:
        email: E-mail address entered in the submission form.
        groups_dir: Directory holding ``groups_data.csv``. Defaults to
            ``GROUPS_INFO_PATH`` (the locally downloaded groups dataset).

    Returns:
        The ``group_name`` value of the first row whose ``email`` column
        equals *email*.

    Raises:
        Exception: If no row matches *email*.
    """
    if groups_dir is None:
        groups_dir = GROUPS_INFO_PATH

    # The file lives inside the groups directory. Passing the file name as a
    # second positional argument to read_csv (as the code did before) never
    # opens it.
    df = pd.read_csv(os.path.join(groups_dir, 'groups_data.csv'))

    df_email = df[df['email'] == email].reset_index(drop=True)
    if len(df_email) > 0:
        return df_email.iloc[0]['group_name']
    raise Exception("E-mail is not valid")
72
+
73
+
74
def validate_pred_file(df_pred):
    """Check that *df_pred* has every column listed in ``REQUIRED_COLUMNS``.

    Raises:
        Exception: Naming the first required column (in ``REQUIRED_COLUMNS``
            order) that is absent from the frame.
    """
    available = df_pred.columns
    first_missing = next(
        (name for name in REQUIRED_COLUMNS if name not in available),
        None,
    )
    if first_missing is not None:
        raise Exception(f"Column {first_missing} not in prediction file")
78
+
79
+
80
def get_revision(df_results, email):
    """Return the highest ``revision`` recorded for *email*, or 0 if it has none.

    Args:
        df_results: Results frame with ``email`` and ``revision`` columns.
        email: E-mail address identifying the group.
    """
    matches = df_results.loc[df_results['email'] == email]
    if matches.shape[0] == 0:
        # No earlier submission from this e-mail.
        return 0
    return matches['revision'].max()
86
+
87
+
88
def get_results_dataframe(results_dir=None):
    """Load every ``*.csv`` file in the results directory into one frame.

    Args:
        results_dir: Directory to scan. Defaults to ``EVAL_RESULTS_PATH``.

    Returns:
        A DataFrame with the rows of all CSV files, read in file-name order
        and re-indexed from 0. When the directory holds no CSV file, an
        empty frame is returned instead of raising, so the leaderboard can
        still render before the first submission.
    """
    if results_dir is None:
        results_dir = EVAL_RESULTS_PATH

    # os.listdir order is arbitrary; sort so the row order is reproducible.
    csv_names = sorted(f for f in os.listdir(results_dir) if f.endswith('.csv'))
    if not csv_names:
        # pd.concat raises ValueError on an empty list. 'email' and
        # 'revision' are the columns the rest of the app reads; the others
        # mirror the fields upload_results writes (assumes the model column
        # is named 'model_name' - confirm).
        return pd.DataFrame(columns=[
            'email', 'group_name', 'model_name', 'submission_date',
            'revision', 'MRR10', 'PRECISION10',
        ])

    dfs = [pd.read_csv(os.path.join(results_dir, f)) for f in csv_names]
    # Each file contributes rows indexed from 0; renumber to avoid duplicates.
    return pd.concat(dfs, ignore_index=True)
94
+
95
+
96
def upload_results(group_email, group_name, model_name, revision, mrr10, precision10):
    """Write one submission's scores to a CSV file and upload it to the results repo.

    Args:
        group_email: E-mail address identifying the submitting group.
        group_name: Name of the submitting group.
        model_name: Model name entered in the submission form.
        revision: Revision number assigned to this submission.
        mrr10: MRR@10 score of the submission.
        precision10: Precision@10 score of the submission.

    The row is first saved under ``TEMP_RESULTS_PATH`` with a random UUID
    file name, then uploaded to ``RESULTS_REPO`` under that same name.
    """
    submission_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    os.makedirs(TEMP_RESULTS_PATH, exist_ok=True)

    df_temp_results = pd.DataFrame({
        'email': [group_email],
        'group_name': [group_name],
        # The key must be the literal column name. Using the variable (as
        # the code did before) named the column after the submitted model.
        'model_name': [model_name],
        "submission_date": [submission_date],
        "revision": [revision],
        "MRR10": [mrr10],
        "PRECISION10": [precision10],
    })

    file_name = str(uuid.uuid4()) + '.csv'
    temp_path = os.path.join(TEMP_RESULTS_PATH, file_name)
    df_temp_results.to_csv(temp_path, index=False)

    # upload_file requires path_in_repo; store the file at the repo root
    # under its UUID name.
    hh.upload_file(
        path_or_fileobj=temp_path,
        path_in_repo=file_name,
        repo_id=RESULTS_REPO,
        repo_type="dataset",
        token=TOKEN,
    )
108
+
109
+
110
  def render():
111
  st.set_page_config(page_title="RecTour2024 - Booking.com Review Ranking Challenge Leaderboard", layout="wide")
112
  st.title("🏆 RecTour2024 Leaderboard")
 
115
 
116
  # leaderboard area
117
  if leaderboard_tab.button("Refresh"):
118
+ refresh_data()
 
 
 
119
 
120
+ df_results = get_results_dataframe()
121
+ leaderboard_tab.dataframe(df_results.drop(columns=['email']))
122
 
123
  # submission area
124
+ group_email = submission_tab.text_input(label="Group email", value="")
125
  model_name = submission_tab.text_input(label="Model name", value="")
126
+ pred_file = submission_tab.file_uploader(label="Upload your prediction file",
127
+ help="Upload a csv.zip file, in pandas this can be achieved "
128
+ "with df.to_csv(<file_path>, compression='zip')",)
129
  if submission_tab.button("Upload"):
130
  if not pred_file:
131
  submission_tab.markdown("no file was submitted!")
132
  else:
133
+ try:
134
+ group_name = get_group_name_by_email(group_email)
135
+ df_pred = pd.read_csv(pred_file, compression='zip')
136
+ validate_pred_file(df_pred)
137
+ mrr10, precision10 = calculate_metrics(df_pred)
138
+ revision = get_revision(df_results=df_results, email=group_email) + 1 # generate next revision id
139
+ upload_results(group_email=group_email, group_name=group_name, model_name=model_name, revision=revision,
140
+ mrr10=mrr10, precision10=precision10)
141
+
142
+ submission_tab.markdown("## THANK YOU FOR YOUR SUBMISSION!")
143
+ submission_tab.markdown("Here are your submission details:")
144
+ submission_tab.markdown("**Group name:** " + group_name)
145
+ submission_tab.markdown("**Model name:** " + model_name)
146
+ submission_tab.markdown("**Revision:** " + revision + "(out of {MAX_SUBMISSIONS} allowed submissions)")
147
+
148
+ submission_tab.write("### Submission results")
149
+ submission_tab.markdown("**MRR@10:** {:.4f}".format(mrr10))
150
+ submission_tab.markdown("**Precision@10:** {:.4f}".format(precision10))
151
+ except Exception as e:
152
+ submission_tab.markdown(e)
153
 
154
 
155
  if __name__ == "__main__":