Terry Zhuo committed · 7eeb535 · 1 parent: 3204d18 · update
app.py CHANGED
@@ -134,117 +134,114 @@ def evaluate(
     gt_pass_rate = np.mean([1 if v is not None else 0 for k, v in expected_time.items() if k in problems])
     failed_tasks = [k for k, v in expected_time.items() if v is None and k in problems]

-    if gt_pass_rate > 0.99:
-        cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
-    else:
-        cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
-    if len(failed_tasks) > 0:
-        cprint(f"Failed tasks: {failed_tasks}", "red")
-    return {"gt_pass_rate":float(gt_pass_rate), "failed_tasks": failed_tasks}
+    pass_at_k = dict()

+    if not check_gt_only:
+
+        results = {
+            "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
+            "eval": {},
+        }

+        with ProcessPoolExecutor(max_workers=n_workers) as executor:
+            futures = []
+            completion_id = Counter()
+            n_samples = 0
+            eval_results = defaultdict(list)  # task_id ->
+            remainings = set()

+            print("Reading samples...")
+            for sample in tqdm(load_solutions(samples)):
+                task_id = sample["task_id"]
+
+                if task_id not in problems:
+                    warn(
+                        f"Task {task_id} is found in the samples but not found in the dataset"
+                    )
+                    continue
+                solution = (
+                    sample["solution"]
+                    if "solution" in sample
+                    else problems[task_id]["complete_prompt"] + sample["completion"]
                 )
+                if "sanitized-calibrated" in samples:
+                    solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution
+                remainings.add(sample["_identifier"])
+                args = (
+                    completion_id[task_id],
+                    problems[task_id],
+                    solution,
+                    max_as_limit,
+                    max_data_limit,
+                    max_stack_limit,
+                    sample["_identifier"],
+                    min_time_limit,
+                    expected_time[task_id] if expected_time[task_id] else 20
+                )
+                futures.append(executor.submit(check_correctness, *args))
+                completion_id[task_id] += 1
+                n_samples += 1

+            assert n_samples == len(remainings), "Missing problems in unfinished"
+            assert len(completion_id) == len(problems), "Missing problems in samples"

+            # def stucking_checker():
+            #     not_done = futures
+            #     while len(not_done) > 0:
+            #         done, not_done = wait(not_done, timeout=240, return_when=FIRST_COMPLETED)

+            #         if len(done) == 0:
+            #             warn("No samples have finished testing in the last 240s")
+            #             warn(f"{len(remainings)} samples to be tested: {remainings}")

+            # threading.Thread(target=stucking_checker).start()

+            for future in tqdm(as_completed(futures), total=n_samples):
+                result = future.result()
+                remainings.remove(result["_identifier"])
+                eval_results[result["task_id"]].append(result)


+        # sort the results for each problem by completion_id
+        for task_id, task_results in eval_results.items():
+            task_results.sort(key=lambda x: x["completion_id"])
+            results["eval"][task_id] = []
+            for res in task_results:
+                stat, details = res["base"]
+                results["eval"][task_id].append(
+                    {
+                        "task_id": task_id,
+                        "solution": res["solution"],
+                        "status": stat,
+                        "details": details,
+                    }
+                )

+        # Calculate pass@k.
+        total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
+        base_correct = []

+        for key, res in results["eval"].items():
+            if key not in problems:
+                continue
+            bc = sum([r["status"] == PASS for r in res])
+            base_correct.append(bc)

+        base_correct = np.array(base_correct)

+        pass_at_k.update({
+            f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
+            for k in pass_k
+            if total.min() >= k
+        })
+
     pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
     pass_at_k["split"] = split
     pass_at_k["subset"] = subset
     pass_at_k["calibrated"] = "sanitized-calibrated" in samples
     pass_at_k["gt_pass_rate"] = gt_pass_rate
     pass_at_k["failed_tasks"] = failed_tasks
+
     return results, pass_at_k
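The rewritten block computes pass@k via estimate_pass_at_k(total, base_correct, k), a helper imported elsewhere in app.py and not shown in this diff. For reference, this is conventionally the unbiased estimator from Chen et al. (2021), pass@k = E[1 - C(n-c, k) / C(n, k)] over tasks with n samples and c correct ones. A minimal sketch of what such a helper typically looks like; the name matches the call site, but the body below is a reconstruction, not the project's code:

import itertools

import numpy as np

def estimate_pass_at_k(num_samples, num_correct, k):
    """Per-task unbiased pass@k: 1 - C(n-c, k) / C(n, k), computed stably."""
    def estimator(n: int, c: int, k: int) -> float:
        if n - c < k:
            return 1.0  # every size-k draw must contain a correct sample
        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        num_samples_it = iter(num_samples)
    return np.array(
        [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
    )

# Example: 5 samples per task, 2 correct on task A and 0 on task B.
# estimate_pass_at_k(np.array([5, 5]), np.array([2, 0]), 1) -> [0.4, 0.0]
# The diff then averages this array with .mean(), as in pass_at_k.update(...).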
@@ -252,8 +249,8 @@ def run_gradio():
     interface = gr.Interface(
         fn=evaluate,
         inputs=[
-            gr.Dropdown(["complete", "instruct"], label="Split"),
-            gr.Dropdown(["full", "hard"], label="Subset"),
+            gr.Dropdown(["complete", "instruct"], label="BigCodeBench Split"),
+            gr.Dropdown(["full", "hard"], label="BigCodeBench Subset"),
             gr.File(label="Samples Path (.jsonl)"),
             gr.Textbox(label="Pass k Values (comma-separated)", value="1,5,10"),
             gr.Slider(1, multiprocessing.cpu_count(), step=1, label="Parallel Workers"),
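The second hunk only renames the two dropdown labels. For context, a self-contained sketch of how these widgets line up, in order, with the leading parameters of evaluate; the stub function, the pass_k parsing, and the JSON output component are assumptions, since the diff shows neither evaluate's full signature nor the outputs list:

import multiprocessing

import gradio as gr

# Stand-in for app.py's evaluate(); only the widget-to-parameter order matters.
def evaluate_stub(split, subset, samples, pass_k, parallel_workers):
    ks = [int(tok) for tok in pass_k.split(",") if tok.strip()]  # "1,5,10" -> [1, 5, 10]
    return {"split": split, "subset": subset, "pass_k": ks, "workers": parallel_workers}

interface = gr.Interface(
    fn=evaluate_stub,
    inputs=[
        gr.Dropdown(["complete", "instruct"], label="BigCodeBench Split"),
        gr.Dropdown(["full", "hard"], label="BigCodeBench Subset"),
        gr.File(label="Samples Path (.jsonl)"),
        gr.Textbox(label="Pass k Values (comma-separated)", value="1,5,10"),
        gr.Slider(1, multiprocessing.cpu_count(), step=1, label="Parallel Workers"),
    ],
    outputs=gr.JSON(label="Results"),  # assumed; the diff does not show outputs
)

if __name__ == "__main__":
    interface.launch()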
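The previously active stuck-job watchdog survives in the new code only as the commented-out stucking_checker block. A self-contained sketch of that pattern, using concurrent.futures.wait with FIRST_COMPLETED exactly as the comments do; the toy sleepy task is invented for the example, and daemon=True is our addition so a wedged pool cannot keep the process alive:

import threading
import time
from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait
from warnings import warn

def sleepy(seconds: float) -> float:
    time.sleep(seconds)
    return seconds

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(sleepy, s) for s in (0.1, 0.2, 0.3)]

        def stucking_checker() -> None:
            # Warn whenever a full timeout window passes with no future finishing.
            not_done = futures
            while len(not_done) > 0:
                done, not_done = wait(not_done, timeout=240, return_when=FIRST_COMPLETED)
                if len(done) == 0:
                    warn("No samples have finished testing in the last 240s")

        threading.Thread(target=stucking_checker, daemon=True).start()

        for future in futures:
            print(future.result())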
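Hunk 1's calibration branch prepends problems[task_id]["code_prompt"] plus a pass stub to each calibrated solution. A toy illustration of why the stub matters: the code prompt ends with a bare function header, which would be a syntax error without a body, and the calibrated solution that follows then shadows the stubbed definition. The task record and sample below are invented for the example:

# Hypothetical task record shaped like the diff's problems[task_id].
problem = {"code_prompt": "def task_func(x):"}

# A calibrated sample: a complete, self-contained definition of task_func.
solution = "def task_func(x):\n    return x + 1\n"

# As in the diff: prompt + "\n    pass\n" keeps the prompt's header parseable;
# the real definition afterwards overrides the stub at execution time.
full = problem["code_prompt"] + "\n    pass\n" + solution
compile(full, "<calibrated>", "exec")  # no SyntaxError
print(full)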