Update app.py
Browse files
app.py
CHANGED
@@ -68,69 +68,24 @@ def load_line_json_data(filename):
|
|
68 |
def add_new_eval(
|
69 |
val_or_test: str,
|
70 |
eval_mode: str,
|
71 |
-
model: str,
|
72 |
-
tooluse_strategy: str,
|
73 |
-
planning_strategy: str,
|
74 |
-
organization: str,
|
75 |
-
mail: str,
|
76 |
path_to_file: str,
|
77 |
):
|
78 |
-
# Very basic email parsing
|
79 |
-
_, parsed_mail = parseaddr(mail)
|
80 |
-
if not "@" in parsed_mail:
|
81 |
-
return format_warning("Please provide a valid email adress.")
|
82 |
-
|
83 |
print("Adding new eval")
|
84 |
|
85 |
if path_to_file is None:
|
86 |
return format_warning("Please attach a file.")
|
87 |
|
88 |
-
# Save submitted file
|
89 |
-
api.upload_file(
|
90 |
-
repo_id=RESULTS_DATASET,
|
91 |
-
path_or_fileobj=path_to_file.name,
|
92 |
-
path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
|
93 |
-
repo_type="dataset",
|
94 |
-
token=TOKEN
|
95 |
-
)
|
96 |
|
97 |
# Compute score
|
98 |
file_path = path_to_file.name
|
99 |
-
result = eval_score(val_or_test,file_path=file_path,TOKEN=TOKEN)
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
repo_id=RESULTS_DATASET,
|
106 |
-
path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}.jsonl",
|
107 |
-
path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
|
108 |
-
repo_type="dataset",
|
109 |
-
token=TOKEN
|
110 |
-
)
|
111 |
-
|
112 |
-
# Actual submission
|
113 |
-
eval_entry = {
|
114 |
-
"Model": model,
|
115 |
-
"Tool-use Strategy": tooluse_strategy,
|
116 |
-
"Planning Strategy": planning_strategy,
|
117 |
-
"Organization": organization,
|
118 |
-
"Mail": mail,
|
119 |
-
"Delivery Rate": result['Delivery Rate'],
|
120 |
-
"Commonsense Constraint Micro Pass Rate":result['Commonsense Constraint Micro Pass Rate'],
|
121 |
-
"Commonsense Constraint Macro Pass Rate":result['Commonsense Constraint Macro Pass Rate'],
|
122 |
-
"Hard Constraint Micro Pass Rate":result['Hard Constraint Micro Pass Rate'],
|
123 |
-
"Hard Constraint Macro Pass Rate":result['Hard Constraint Macro Pass Rate'],
|
124 |
-
"Final Pass Rate":result['Final Pass Rate']
|
125 |
-
}
|
126 |
-
eval_mode = eval_mode.replace('-','')
|
127 |
-
eval_results[f'{val_or_test}_{eval_mode}'] = eval_results[f'{val_or_test}_{eval_mode}'].add_item(eval_entry)
|
128 |
|
129 |
-
|
130 |
-
|
131 |
-
eval_results.push_to_hub(RESULTS_DATASET, config_name = 'scores', token=TOKEN)
|
132 |
-
|
133 |
-
return format_log(f"Model: {model} | Tool-use Strategy: {tooluse_strategy} | Planning Strategy: {planning_strategy} | submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed (Validation ~2mins, Test ~7mins).")
|
134 |
|
135 |
|
136 |
def refresh():
|
@@ -141,9 +96,6 @@ def refresh():
|
|
141 |
eval_dataframe_test_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="test",mode='soleplanning')
|
142 |
return eval_dataframe_val_twostage, eval_dataframe_val_soleplanning, eval_dataframe_test_twostage, eval_dataframe_test_soleplanning
|
143 |
|
144 |
-
# def upload_file(files):
|
145 |
-
# file_paths = [file.name for file in files]
|
146 |
-
# return file_paths
|
147 |
|
148 |
|
149 |
demo = gr.Blocks()
|
@@ -185,14 +137,8 @@ with demo:
|
|
185 |
with gr.Column():
|
186 |
level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
|
187 |
eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
|
188 |
-
|
189 |
-
|
190 |
-
planning_strategy = gr.Textbox(label="Planning Strategy")
|
191 |
-
with gr.Column():
|
192 |
-
organization = gr.Textbox(label="Organization")
|
193 |
-
mail = gr.Textbox(label="Contact email")
|
194 |
-
file_output = gr.File()
|
195 |
-
|
196 |
|
197 |
submit_button = gr.Button("Submit Eval")
|
198 |
submission_result = gr.Markdown()
|
@@ -201,16 +147,12 @@ with demo:
|
|
201 |
[
|
202 |
level_of_test,
|
203 |
eval_mode,
|
204 |
-
|
205 |
-
tooluse_strategy,
|
206 |
-
planning_strategy,
|
207 |
-
organization,
|
208 |
-
mail,
|
209 |
-
file_output,
|
210 |
],
|
211 |
-
submission_result,
|
212 |
)
|
213 |
|
214 |
demo.launch(debug=True)
|
215 |
|
216 |
|
|
|
|
68 |
def add_new_eval(
    val_or_test: str,
    eval_mode: str,
    path_to_file,
):
    """Score an uploaded submission file and expose a detailed report for download.

    Args:
        val_or_test: Split to evaluate against ("validation" or "test").
        eval_mode: Evaluation mode from the UI (e.g. "two-stage" / "sole-planning").
            NOTE(review): not forwarded to eval_score here — confirm that is intended.
        path_to_file: Uploaded file object from a gr.File input (exposes a ``.name``
            filesystem path), or ``None`` when nothing was attached.

    Returns:
        On success, a pair ``(markdown_log, gr.File)``: the formatted score summary
        and a now-visible download component pointing at the detailed constraint
        pass-rate report. When no file was attached, a single warning string.
    """
    print("Adding new eval")

    # Guard clause: the Gradio file input yields None when nothing was uploaded.
    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Compute score. eval_score returns the aggregate result plus a per-sample
    # detail payload (assumed JSON-serializable — TODO confirm against eval_score).
    file_path = path_to_file.name
    result, detail_json = eval_score(val_or_test, file_path=file_path, TOKEN=TOKEN)

    # Persist the detailed report under a second-resolution timestamped name so
    # submissions do not clobber each other's reports. NOTE(review): two
    # submissions within the same second would still collide — consider tempfile.
    # Assumes `datetime` is the class (`from datetime import datetime`) at file top.
    output_path = os.path.join('.', datetime.now().strftime('%Y%m%d%H%M%S') + '.json')
    with open(output_path, 'w') as report_file:
        json.dump(detail_json, report_file)

    return format_log(f"{result}"), gr.File(label="Download the detailed constraint pass rate reports", value=output_path, visible=True)
|
|
|
|
|
|
|
|
|
89 |
|
90 |
|
91 |
def refresh():
|
|
|
96 |
eval_dataframe_test_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="test",mode='soleplanning')
|
97 |
return eval_dataframe_val_twostage, eval_dataframe_val_soleplanning, eval_dataframe_test_twostage, eval_dataframe_test_soleplanning
|
98 |
|
|
|
|
|
|
|
99 |
|
100 |
|
101 |
demo = gr.Blocks()
|
|
|
137 |
with gr.Column():
|
138 |
level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
|
139 |
eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
|
140 |
+
file_input = gr.File(label="Upload file")
|
141 |
+
file_output = gr.File(label="Download the detailed constraint pass rate reports", visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
submit_button = gr.Button("Submit Eval")
|
144 |
submission_result = gr.Markdown()
|
|
|
147 |
[
|
148 |
level_of_test,
|
149 |
eval_mode,
|
150 |
+
file_input,
|
|
|
|
|
|
|
|
|
|
|
151 |
],
|
152 |
+
[submission_result, file_output]
|
153 |
)
|
154 |
|
155 |
demo.launch(debug=True)
|
156 |
|
157 |
|
158 |
+
|