Start rendering actual data + minor improvements
- app.py +39 -15
- requirements.txt +1 -0
- src/content.py +3 -1
- src/get_results_for_task.py +29 -1
- src/leaderboard_formatting.py +39 -0
- src/submission_uploader.py +8 -2
- src/tasks.py +1 -1
app.py
CHANGED
@@ -1,19 +1,41 @@
+import logging
 import os
 
 import gradio as gr  # type: ignore[import]
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import HfApi
 
-from src.content import (
+from src.content import (
+    INTRODUCTION_TEXT,
+    INTRODUCTION_TITLE,
+    LEADERBOARD_TEXT,
+    LEADERBOARD_TITLE,
+    SUBMISSION_TEXT_FILES,
+    SUBMISSION_TEXT_INTRO,
+    SUBMISSION_TEXT_METADATA,
+    SUBMISSION_TEXT_SUBMIT,
+    SUBMISSION_TEXT_TASK,
+    SUBMISSION_TITLE,
+)
+from src.get_results_for_task import get_results_for_task
 from src.submission_uploader import SubmissionUploader
 from src.tasks import TASKS_DESCRIPTIONS, TASKS_PRETTY, TASKS_PRETTY_REVERSE
 
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
 submission_uploader = SubmissionUploader(os.environ["DATASET_ID"])
 
 
+def restart_space():
+    HfApi(token=os.environ["HF_TOKEN"]).restart_space(
+        repo_id="JetBrains-Research/long-code-arena"
+    )
+
+
 with gr.Blocks() as demo:
     gr.HTML(INTRODUCTION_TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
@@ -28,7 +50,7 @@ with gr.Blocks() as demo:
                 gr.Markdown(TASKS_DESCRIPTIONS[task])
 
                 leaderboard_table = gr.components.Dataframe(
-                    value=
+                    value=get_results_for_task(task), interactive=False
                 )
 
     gr.HTML(SUBMISSION_TITLE)
@@ -55,26 +77,25 @@ with gr.Blocks() as demo:
            )
            context_size_textbox = gr.Textbox(
                label="Context Size",
-                placeholder="Context size
+                placeholder="Context size in tokens used for the submission (should be an integer).",
            )
        with gr.Column():
            submitted_by_textbox = gr.Textbox(
                label="Submitted By",
-                placeholder="
+                placeholder="How to display on the leaderboard who submitted the model.",
+            )
+            url_textbox = gr.Textbox(
+                label="Relevant URLs",
+                placeholder="URLs to relevant resources with additional details about your submission (optional).",
            )
            contact_textbox = gr.Textbox(
                label="Contact Information",
-                placeholder="How Long Code Arena team can contact you
+                placeholder="How Long Code Arena team can contact you (won't go to public dataset).",
            )
            comment_textbox = gr.Textbox(
                label="Comment",
                placeholder="Any comments you have for Long Code Arena team (optional, won't go to public dataset).",
            )
-            url_textbox = gr.Textbox(
-                label="Relevant URLs",
-                placeholder="URLs to relevant resources (preprint/blogpost/code/etc.) with "
-                "additional details about your submission.",
-            )
 
    gr.Markdown(SUBMISSION_TEXT_FILES, elem_classes="markdown-text")
    file_output = gr.File(file_count="multiple")
@@ -98,4 +119,7 @@ with gr.Blocks() as demo:
    )
 
 if __name__ == "__main__":
+    scheduler = BackgroundScheduler()
+    scheduler.add_job(restart_space, "interval", seconds=30 * 60)
+    scheduler.start()
    demo.launch()
requirements.txt
CHANGED
@@ -2,6 +2,7 @@ huggingface_hub
 jsonlines
 pandas
 tqdm
+apscheduler
 # CMG metrics
 evaluate
 bert-score
src/content.py
CHANGED
@@ -28,4 +28,6 @@ SUBMISSION_TEXT_FILES = """3. Attach one or more files with your model's predict
 * If several files are attached, they will be treated as separate runs of the submitted model (e.g., with different seeds), and the metrics will be averaged across runs. For baselines provided by 🏟️ Long Code Arena Team, the results are averaged across 3 runs.
 * Please, attach files in [JSONLines format](https://jsonlines.org/). For an example, check the predictions provided by 🏟️ Long Code Arena Team in 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results). Make sure to include `"prediction"` and `"reference"` fields for each example, the rest are optional.
 """
-SUBMISSION_TEXT_SUBMIT = """All set! A new PR to 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results) should be opened when you press "Submit" button. 🏟️ Long Code Arena Team will review it shortly, and the results will appear in the leaderboard.
+SUBMISSION_TEXT_SUBMIT = """All set! A new PR to 🤗 [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results) should be opened when you press "Submit" button. 🏟️ Long Code Arena Team will review it shortly, and the results will appear in the leaderboard.
+
+⏳ **Note:** It might take some time (up to 40 minutes) for PR to get created, since it involves computing metrics for your submission."""
src/get_results_for_task.py
CHANGED
@@ -1,7 +1,15 @@
+import logging
+import os
+
 import pandas as pd  # type: ignore[import]
+from datasets import get_dataset_config_names, load_dataset  # type: ignore[import]
+
+from .leaderboard_formatting import COLUMNS_PRETTY, get_columns_per_task
 
+AVAILABLE_TASKS = get_dataset_config_names(os.environ["DATASET_ID"])
 
-def get_results_for_task_stub(task: str) -> pd.DataFrame:
+
+def _get_results_stub() -> pd.DataFrame:
     stub_df = pd.DataFrame(
         [
             {
@@ -29,3 +37,23 @@ def get_results_for_task_stub(task: str) -> pd.DataFrame:
         ]
     )
     return stub_df
+
+
+def _get_results_dataset(task_id: str) -> pd.DataFrame:
+    results_df = load_dataset(
+        os.environ["DATASET_ID"], task_id, split="test"
+    ).to_pandas()
+    results_df = results_df.rename(columns=COLUMNS_PRETTY, errors="ignore")
+    results_df["Context Size"] = results_df["Context Size"].map(
+        lambda x: f"{int(x) // 1000}k" if int(x) >= 1000 else x
+    )
+    results_df = results_df[get_columns_per_task(task_id)]
+    return results_df
+
+
+def get_results_for_task(task_id: str) -> pd.DataFrame:
+    if task_id in AVAILABLE_TASKS:
+        logging.info(f"Retrieving results for {task_id}...")
+        return _get_results_dataset(task_id)
+    logging.info(f"Generating leaderboard stub for {task_id}...")
+    return _get_results_stub()
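For context, the new "Context Size" formatting above maps raw token counts to a compact label before the table is rendered. A minimal standalone sketch (the helper name and sample values are ours for illustration, not part of the commit):

```python
# Illustrative sketch of the "Context Size" pretty-printing used in
# _get_results_dataset: values of 1000 tokens or more are shown in
# thousands with a "k" suffix, smaller values pass through unchanged.
def format_context_size(x):
    return f"{int(x) // 1000}k" if int(x) >= 1000 else x

print(format_context_size(16000))  # -> 16k
print(format_context_size(512))    # -> 512
```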
src/leaderboard_formatting.py
ADDED
@@ -0,0 +1,39 @@
+from typing import List
+
+COLUMNS_PRETTY = {
+    "bleu": "BLEU",
+    "chrf": "ChrF",
+    "rouge1": "ROUGE-1",
+    "rouge2": "ROUGE-2",
+    "rougeL": "ROUGE-L",
+    "bertscore": "BERTScore",
+    "bertscore_normalized": "BERTScore (Normalized)",
+    "model_name": "Model",
+    "model_availability": "Availability",
+    "urls": "URLs",
+    "context_size": "Context Size",
+    "submitted_by": "Submitted By",
+}
+
+
+METRICS_PER_TASK = {
+    "commit_message_generation": [
+        "BLEU",
+        "ChrF",
+        "ROUGE-1",
+        "ROUGE-2",
+        "ROUGE-L",
+        "BERTScore",
+        "BERTScore (Normalized)",
+    ]
+}
+
+
+def get_columns_per_task(task_id: str) -> List[str]:
+    metrics_per_task = METRICS_PER_TASK[task_id]
+
+    return (
+        ["Model Name", "Availability", "Context Size"]
+        + metrics_per_task
+        + ["Submitted By", "URLs"]
+    )
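As a quick usage illustration (our example, not part of the commit), `get_columns_per_task` fixes the column order for a task's leaderboard table:

```python
from src.leaderboard_formatting import get_columns_per_task

# Column order used for the Commit Message Generation leaderboard table.
print(get_columns_per_task("commit_message_generation"))
# ['Model Name', 'Availability', 'Context Size', 'BLEU', 'ChrF', 'ROUGE-1',
#  'ROUGE-2', 'ROUGE-L', 'BERTScore', 'BERTScore (Normalized)',
#  'Submitted By', 'URLs']
```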
src/submission_uploader.py
CHANGED
@@ -156,6 +156,7 @@ class SubmissionUploader:
 
     def _verify_arguments(
         self,
+        task_pretty: str,
         model_folder: str,
         model_name_pretty: str,
         model_availability: str,
@@ -164,6 +165,9 @@
         submitted_by: str,
         filenames: Optional[List[str]],
     ):
+        assert (
+            task_pretty and task_pretty in TASKS_PRETTY_REVERSE
+        ), "Please, select one of the supported tasks."
         assert (
             model_folder
         ), "Please, specify non-empty name for a directory with a model's results."
@@ -200,6 +204,7 @@
     ) -> str:
         try:
             self._verify_arguments(
+                task_pretty=task_pretty,
                 model_folder=model_folder,
                 model_name_pretty=model_name_pretty,
                 model_availability=model_availability,
@@ -208,12 +213,13 @@
                 submitted_by=submitted_by,
                 filenames=filenames,
             )
-
             pr_title = f"🚀 New submission to {task_pretty} task: {model_name_pretty} with {context_size} context size from {submitted_by}"
 
+            logging.info(f"Start processing {pr_title}")
+
             task_id = TASKS_PRETTY_REVERSE[task_pretty]
 
-            logging.info("Checking if this request
+            logging.info("Checking if this request has already been submitted...")
             if not force:
                 if model_name_pretty in self._fs.ls(
                     f"datasets/{self._dataset_id}/{task_id}/predictions"
src/tasks.py
CHANGED
@@ -17,7 +17,7 @@ TASKS_DESCRIPTIONS = {
 * [BLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu)
 * [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge)
 * [ChrF](https://huggingface.co/spaces/evaluate-metric/chrf)
-* [BERTScore](https://huggingface.co/spaces/evaluate-metric/
+* [BERTScore](https://huggingface.co/spaces/evaluate-metric/bertscore)
 
 For further details on the dataset and the baselines from 🏟️ Long Code Arena Team, refer to `commit_message_generation` folder in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines) or to our preprint (TODO).
 """,