Terry Zhuo committed
Commit 7eeb535 • 1 parent: 3204d18
update

app.py CHANGED
@@ -134,117 +134,114 @@ def evaluate(
     gt_pass_rate = np.mean([1 if v is not None else 0 for k, v in expected_time.items() if k in problems])
     failed_tasks = [k for k, v in expected_time.items() if v is None and k in problems]

-    if check_gt_only:
-        if gt_pass_rate > 0.99:
-            cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
-        else:
-            cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
-        if len(failed_tasks) > 0:
-            cprint(f"Failed tasks: {failed_tasks}", "red")
-        return {"gt_pass_rate":float(gt_pass_rate), "failed_tasks": failed_tasks}
-    ...
-                min_time_limit,
-                expected_time[task_id] if expected_time[task_id] else 20
-            )
-            futures.append(executor.submit(check_correctness, *args))
-            completion_id[task_id] += 1
-            n_samples += 1
-    ...
-            not_done = futures
-            while len(not_done) > 0:
-    ...
+    pass_at_k = dict()
+
+    if not check_gt_only:
+
+        results = {
+            "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
+            "eval": {},
+        }
+
+        with ProcessPoolExecutor(max_workers=n_workers) as executor:
+            futures = []
+            completion_id = Counter()
+            n_samples = 0
+            eval_results = defaultdict(list)  # task_id ->
+            remainings = set()
+
+            print("Reading samples...")
+            for sample in tqdm(load_solutions(samples)):
+                task_id = sample["task_id"]
+
+                if task_id not in problems:
+                    warn(
+                        f"Task {task_id} is found in the samples but not found in the dataset"
+                    )
+                    continue
+                solution = (
+                    sample["solution"]
+                    if "solution" in sample
+                    else problems[task_id]["complete_prompt"] + sample["completion"]
+                )
+                if "sanitized-calibrated" in samples:
+                    solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution
+                remainings.add(sample["_identifier"])
+                args = (
+                    completion_id[task_id],
+                    problems[task_id],
+                    solution,
+                    max_as_limit,
+                    max_data_limit,
+                    max_stack_limit,
+                    sample["_identifier"],
+                    min_time_limit,
+                    expected_time[task_id] if expected_time[task_id] else 20
+                )
+                futures.append(executor.submit(check_correctness, *args))
+                completion_id[task_id] += 1
+                n_samples += 1
+
+            assert n_samples == len(remainings), "Missing problems in unfinished"
+            assert len(completion_id) == len(problems), "Missing problems in samples"
+
+            # def stucking_checker():
+            #     not_done = futures
+            #     while len(not_done) > 0:
+            #         done, not_done = wait(not_done, timeout=240, return_when=FIRST_COMPLETED)
+
+            #         if len(done) == 0:
+            #             warn("No samples have finished testing in the last 240s")
+            #             warn(f"{len(remainings)} samples to be tested: {remainings}")
+
+            # threading.Thread(target=stucking_checker).start()
+
+            for future in tqdm(as_completed(futures), total=n_samples):
+                result = future.result()
+                remainings.remove(result["_identifier"])
+                eval_results[result["task_id"]].append(result)
+
+        # sort the results for each problem by completion_id
+        for task_id, task_results in eval_results.items():
+            task_results.sort(key=lambda x: x["completion_id"])
+            results["eval"][task_id] = []
+            for res in task_results:
+                stat, details = res["base"]
+                results["eval"][task_id].append(
+                    {
+                        "task_id": task_id,
+                        "solution": res["solution"],
+                        "status": stat,
+                        "details": details,
+                    }
+                )
+
+        # Calculate pass@k.
+        total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
+        base_correct = []
+
+        for key, res in results["eval"].items():
+            if key not in problems:
+                continue
+            bc = sum([r["status"] == PASS for r in res])
+            base_correct.append(bc)
+
+        base_correct = np.array(base_correct)
+
+        pass_at_k.update({
+            f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
+            for k in pass_k
+            if total.min() >= k
+        })
+
     pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
     pass_at_k["split"] = split
     pass_at_k["subset"] = subset
     pass_at_k["calibrated"] = "sanitized-calibrated" in samples
     pass_at_k["gt_pass_rate"] = gt_pass_rate
     pass_at_k["failed_tasks"] = failed_tasks
+
     return results, pass_at_k

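Note on the pass@k computation above: estimate_pass_at_k(total, base_correct, k) is expected to implement the unbiased pass@k estimator from the Codex paper (Chen et al., 2021). A minimal sketch under that assumption — the names here are illustrative, not necessarily the Space's exact implementation:

import numpy as np

def estimate_pass_at_k_sketch(num_samples, num_correct, k):
    # Unbiased estimator: pass@k = 1 - C(n - c, k) / C(n, k),
    # computed as a running product to avoid huge binomial coefficients.
    def estimator(n: int, c: int, k: int) -> float:
        if n - c < k:
            # Fewer than k failures: every size-k draw contains a correct sample.
            return 1.0
        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

    return np.array([estimator(int(n), int(c), k)
                     for n, c in zip(num_samples, num_correct)])

With total holding the per-task sample counts and base_correct the per-task pass counts, estimate_pass_at_k(total, base_correct, k).mean() averages the per-task estimates, matching how the diff builds each pass@{k} entry.
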
@@ -252,8 +249,8 @@ def run_gradio():
     interface = gr.Interface(
         fn=evaluate,
         inputs=[
-            gr.Dropdown(["complete", "instruct"], label="Split"),
-            gr.Dropdown(["full", "hard"], label="Subset"),
+            gr.Dropdown(["complete", "instruct"], label="BigCodeBench Split"),
+            gr.Dropdown(["full", "hard"], label="BigCodeBench Subset"),
             gr.File(label="Samples Path (.jsonl)"),
             gr.Textbox(label="Pass k Values (comma-separated)", value="1,5,10"),
             gr.Slider(1, multiprocessing.cpu_count(), step=1, label="Parallel Workers"),
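
For context on the relabeled widgets: gr.Interface binds the components in inputs positionally to the parameters of fn, so the two dropdowns feed evaluate's split and subset arguments, the file picker its samples path, and so on. A self-contained sketch of that wiring, using a hypothetical add function that is not part of this Space:

import gradio as gr

def add(x: float, y: float) -> float:
    # First input component -> x, second -> y, in list order.
    return x + y

demo = gr.Interface(
    fn=add,
    inputs=[gr.Number(label="x"), gr.Number(label="y")],
    outputs=gr.Number(label="x + y"),
)

if __name__ == "__main__":
    demo.launch()
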
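On the stucking_checker left commented out in the first hunk: it follows a standard watchdog pattern — a side thread repeatedly waits on the unfinished futures and warns when nothing completes within a timeout window. A hedged, self-contained sketch of that pattern (the start_stall_watchdog name and the daemon=True choice are assumptions, not the Space's code):

import threading
from concurrent.futures import FIRST_COMPLETED, wait

def start_stall_watchdog(futures, timeout: float = 240.0) -> None:
    def checker() -> None:
        not_done = set(futures)
        while not_done:
            # Block until at least one future finishes or the timeout elapses.
            done, not_done = wait(not_done, timeout=timeout,
                                  return_when=FIRST_COMPLETED)
            if not done:
                print(f"No samples have finished testing in the last {timeout:.0f}s; "
                      f"{len(not_done)} still pending")

    # daemon=True so the watchdog cannot keep the process alive on its own.
    threading.Thread(target=checker, daemon=True).start()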