Terry Zhuo committed · 7eeb535 · 1 parent: 3204d18 · update
app.py CHANGED
@@ -134,117 +134,114 @@ def evaluate(
     gt_pass_rate = np.mean([1 if v is not None else 0 for k, v in expected_time.items() if k in problems])
     failed_tasks = [k for k, v in expected_time.items() if v is None and k in problems]

-    if gt_pass_rate > 0.99:
-        cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
-    else:
-        cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
-    if len(failed_tasks) > 0:
-        cprint(f"Failed tasks: {failed_tasks}", "red")
-    return {"gt_pass_rate":float(gt_pass_rate), "failed_tasks": failed_tasks}
+    pass_at_k = dict()

+    if not check_gt_only:
+
+        results = {
+            "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
+            "eval": {},
+        }

+        with ProcessPoolExecutor(max_workers=n_workers) as executor:
+            futures = []
+            completion_id = Counter()
+            n_samples = 0
+            eval_results = defaultdict(list)  # task_id ->
+            remainings = set()

+            print("Reading samples...")
+            for sample in tqdm(load_solutions(samples)):
+                task_id = sample["task_id"]
+
+                if task_id not in problems:
+                    warn(
+                        f"Task {task_id} is found in the samples but not found in the dataset"
+                    )
+                    continue
+                solution = (
+                    sample["solution"]
+                    if "solution" in sample
+                    else problems[task_id]["complete_prompt"] + sample["completion"]
                 )
+                if "sanitized-calibrated" in samples:
+                    solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution
+                remainings.add(sample["_identifier"])
+                args = (
+                    completion_id[task_id],
+                    problems[task_id],
+                    solution,
+                    max_as_limit,
+                    max_data_limit,
+                    max_stack_limit,
+                    sample["_identifier"],
+                    min_time_limit,
+                    expected_time[task_id] if expected_time[task_id] else 20
+                )
+                futures.append(executor.submit(check_correctness, *args))
+                completion_id[task_id] += 1
+                n_samples += 1

+            assert n_samples == len(remainings), "Missing problems in unfinished"
+            assert len(completion_id) == len(problems), "Missing problems in samples"

+            # def stucking_checker():
+            #     not_done = futures
+            #     while len(not_done) > 0:
+            #         done, not_done = wait(not_done, timeout=240, return_when=FIRST_COMPLETED)

+            #         if len(done) == 0:
+            #             warn("No samples have finished testing in the last 240s")
+            #             warn(f"{len(remainings)} samples to be tested: {remainings}")

+            # threading.Thread(target=stucking_checker).start()

+            for future in tqdm(as_completed(futures), total=n_samples):
+                result = future.result()
+                remainings.remove(result["_identifier"])
+                eval_results[result["task_id"]].append(result)


+        # sort the results for each problem by completion_id
+        for task_id, task_results in eval_results.items():
+            task_results.sort(key=lambda x: x["completion_id"])
+            results["eval"][task_id] = []
+            for res in task_results:
+                stat, details = res["base"]
+                results["eval"][task_id].append(
+                    {
+                        "task_id": task_id,
+                        "solution": res["solution"],
+                        "status": stat,
+                        "details": details,
+                    }
+                )

+        # Calculate pass@k.
+        total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
+        base_correct = []

+        for key, res in results["eval"].items():
+            if key not in problems:
+                continue
+            bc = sum([r["status"] == PASS for r in res])
+            base_correct.append(bc)

+        base_correct = np.array(base_correct)

+        pass_at_k.update({
+            f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
+            for k in pass_k
+            if total.min() >= k
+        })
+
     pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
     pass_at_k["split"] = split
     pass_at_k["subset"] = subset
     pass_at_k["calibrated"] = "sanitized-calibrated" in samples
     pass_at_k["gt_pass_rate"] = gt_pass_rate
     pass_at_k["failed_tasks"] = failed_tasks
+
     return results, pass_at_k
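The rewritten block computes pass@k via estimate_pass_at_k(total, base_correct, k), a helper imported elsewhere in app.py and not shown in this diff. For reference, this is conventionally the unbiased estimator from Chen et al. (2021), pass@k = E[1 - C(n-c, k) / C(n, k)] over tasks with n samples and c correct ones. A minimal sketch of what such a helper typically looks like; the name matches the call site, but the body below is a reconstruction, not the project's code:

import itertools

import numpy as np

def estimate_pass_at_k(num_samples, num_correct, k):
    """Per-task unbiased pass@k: 1 - C(n-c, k) / C(n, k), computed stably."""
    def estimator(n: int, c: int, k: int) -> float:
        if n - c < k:
            return 1.0  # every size-k draw must contain a correct sample
        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        num_samples_it = iter(num_samples)
    return np.array(
        [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
    )

# Example: 5 samples per task, 2 correct on task A and 0 on task B.
# estimate_pass_at_k(np.array([5, 5]), np.array([2, 0]), 1) -> [0.4, 0.0]
# The diff then averages this array with .mean(), as in pass_at_k.update(...).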
@@ -252,8 +249,8 @@ def run_gradio():
     interface = gr.Interface(
         fn=evaluate,
         inputs=[
-            gr.Dropdown(["complete", "instruct"], label="Split"),
-            gr.Dropdown(["full", "hard"], label="Subset"),
+            gr.Dropdown(["complete", "instruct"], label="BigCodeBench Split"),
+            gr.Dropdown(["full", "hard"], label="BigCodeBench Subset"),
             gr.File(label="Samples Path (.jsonl)"),
             gr.Textbox(label="Pass k Values (comma-separated)", value="1,5,10"),
             gr.Slider(1, multiprocessing.cpu_count(), step=1, label="Parallel Workers"),
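The second hunk only renames the two dropdown labels. For context, a self-contained sketch of how these widgets line up, in order, with the leading parameters of evaluate; the stub function, the pass_k parsing, and the JSON output component are assumptions, since the diff shows neither evaluate's full signature nor the outputs list:

import multiprocessing

import gradio as gr

# Stand-in for app.py's evaluate(); only the widget-to-parameter order matters.
def evaluate_stub(split, subset, samples, pass_k, parallel_workers):
    ks = [int(tok) for tok in pass_k.split(",") if tok.strip()]  # "1,5,10" -> [1, 5, 10]
    return {"split": split, "subset": subset, "pass_k": ks, "workers": parallel_workers}

interface = gr.Interface(
    fn=evaluate_stub,
    inputs=[
        gr.Dropdown(["complete", "instruct"], label="BigCodeBench Split"),
        gr.Dropdown(["full", "hard"], label="BigCodeBench Subset"),
        gr.File(label="Samples Path (.jsonl)"),
        gr.Textbox(label="Pass k Values (comma-separated)", value="1,5,10"),
        gr.Slider(1, multiprocessing.cpu_count(), step=1, label="Parallel Workers"),
    ],
    outputs=gr.JSON(label="Results"),  # assumed; the diff does not show outputs
)

if __name__ == "__main__":
    interface.launch()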
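The previously active stuck-job watchdog survives in the new code only as the commented-out stucking_checker block. A self-contained sketch of that pattern, using concurrent.futures.wait with FIRST_COMPLETED exactly as the comments do; the toy sleepy task is invented for the example, and daemon=True is our addition so a wedged pool cannot keep the process alive:

import threading
import time
from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait
from warnings import warn

def sleepy(seconds: float) -> float:
    time.sleep(seconds)
    return seconds

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(sleepy, s) for s in (0.1, 0.2, 0.3)]

        def stucking_checker() -> None:
            # Warn whenever a full timeout window passes with no future finishing.
            not_done = futures
            while len(not_done) > 0:
                done, not_done = wait(not_done, timeout=240, return_when=FIRST_COMPLETED)
                if len(done) == 0:
                    warn("No samples have finished testing in the last 240s")

        threading.Thread(target=stucking_checker, daemon=True).start()

        for future in futures:
            print(future.result())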
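Hunk 1's calibration branch prepends problems[task_id]["code_prompt"] plus a pass stub to each calibrated solution. A toy illustration of why the stub matters: the code prompt ends with a bare function header, which would be a syntax error without a body, and the calibrated solution that follows then shadows the stubbed definition. The task record and sample below are invented for the example:

# Hypothetical task record shaped like the diff's problems[task_id].
problem = {"code_prompt": "def task_func(x):"}

# A calibrated sample: a complete, self-contained definition of task_func.
solution = "def task_func(x):\n    return x + 1\n"

# As in the diff: prompt + "\n    pass\n" keeps the prompt's header parseable;
# the real definition afterwards overrides the stub at execution time.
full = problem["code_prompt"] + "\n    pass\n" + solution
compile(full, "<calibrated>", "exec")  # no SyntaxError
print(full)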