import gradio as gr
import json
import logging
import multiprocessing
import os
import pickle
import threading
import time
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed, wait, FIRST_COMPLETED
from datetime import datetime
from typing import Any, Dict, List, Tuple
from warnings import warn

import gc
import numpy as np
from huggingface_hub import HfApi

from bigcodebench.data import get_bigcodebench, get_bigcodebench_hash, load_solutions
from bigcodebench.data.utils import CACHE_DIR
from bigcodebench.eval import PASS, compatible_eval_result, estimate_pass_at_k, untrusted_check
from bigcodebench.gen.util import trusted_check
from apscheduler.schedulers.background import BackgroundScheduler
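
# Hugging Face Space configuration: restart_space() below calls the Hub API, so
# HF_TOKEN should be set as a Space secret with write access to REPO_ID.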
REPO_ID = "bigcode/bigcodebench-evaluator"
HF_TOKEN = os.environ.get("HF_TOKEN", None)
API = HfApi(token=HF_TOKEN)

Result = Tuple[str, List[bool]]
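

# Run each task's canonical solution through trusted_check to measure how long its
# test suite takes, caching the per-task timings (keyed by the dataset hash) so that
# later runs can skip this step.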
def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit):
    cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    os.makedirs(CACHE_DIR, exist_ok=True)
    tbegin = time.time()

    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        futures = []
        n_samples = 0
        expected_time = dict()

        for problem in problems.values():
            args = (
                problem["complete_prompt"] + "\n" + problem["canonical_solution"],
                problem["test"],
                problem["task_id"],
                max_as_limit,
                max_data_limit,
                max_stack_limit,
                min_time_limit,
            )
            futures.append(executor.submit(trusted_check, *args))
            n_samples += 1

        for future in as_completed(futures):
            result = future.result()
            expected_time[result["task_id"]] = result["time"]

    if any(expected_time.values()):
        with open(cache_file, "wb") as f:
            pickle.dump(expected_time, f)

    return expected_time
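

# Evaluate a single generated solution against its task's test suite via untrusted_check,
# using the cached ground-truth runtime (gt_time_limit) as the per-test time budget.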
def check_correctness(
    completion_id: int,
    problem: Dict[str, Any],
    solution: str,
    max_as_limit: float,
    max_data_limit: float,
    max_stack_limit: float,
    identifier=None,
    min_time_limit: float = 0.1,
    gt_time_limit: float = 2.0,
) -> Dict[str, Result]:
    ret = {
        "completion_id": completion_id,
        "task_id": problem["task_id"],
        "_identifier": identifier,
        "solution": solution,
    }
    ret["base"] = untrusted_check(
        solution,
        problem["test"],
        problem["entry_point"],
        max_as_limit,
        max_data_limit,
        max_stack_limit,
        min_time_limit,
        gt_time_limit,
    )
    return ret
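

# Main entry point: load the selected BigCodeBench subset, optionally verify the
# ground-truth solutions, evaluate every submitted sample in parallel, and return
# the per-task results together with pass@k estimates.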
def evaluate(
    split: str,
    subset: str,
    samples: str,
    pass_k: str = "1,5,10",
    parallel: int = -1,
    min_time_limit: float = 1,
    max_as_limit: int = 30 * 1024,
    max_data_limit: int = 30 * 1024,
    max_stack_limit: int = 10,
    calibrated: bool = True,
    check_gt_only: bool = False,
    no_gt: bool = False,
    selective_evaluate: str = "",
):
    passk = [int(k.strip()) for k in pass_k.split(',') if k.strip().isdigit()]

    if parallel < 1:
        n_workers = max(1, multiprocessing.cpu_count() // 2)
    else:
        n_workers = parallel

    if check_gt_only:
        samples = "__dummy__.jsonl"

    extra = subset + "_" if subset != "full" else ""
    problems = get_bigcodebench(subset=subset)

    # Add selective evaluation logic
    if selective_evaluate:
        selected_ids = ["BigCodeBench/" + id for id in sorted(set(selective_evaluate.split(",")))]
        problems = {k: v for k, v in problems.items() if k in selected_ids}
        if not problems:
            raise ValueError(f"None of the provided task IDs {selected_ids} were found in the dataset")

    dataset_hash = get_bigcodebench_hash(subset=subset)

    if not no_gt:
        expected_time = get_groundtruth(n_workers, problems, dataset_hash, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit)
    else:
        expected_time = {task_id: None for task_id in problems}

    gt_pass_rate = np.mean([1 if v is not None else 0 for k, v in expected_time.items() if k in problems])
    failed_tasks = [k for k, v in expected_time.items() if v is None and k in problems]

    pass_at_k = dict()
    results = {
        "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
        "eval": {},
    }
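
    # Dispatch one check_correctness job per submitted sample; the cached ground-truth
    # runtime (falling back to 20 s when unavailable) serves as each task's test time limit.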
    if not check_gt_only:
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            futures = []
            completion_id = Counter()
            n_samples = 0
            eval_results = defaultdict(list)  # task_id -> list of per-sample results
            remainings = set()

            for sample in load_solutions(samples):
                task_id = sample["task_id"]
                if task_id not in problems:
                    continue

                solution = (
                    sample["solution"]
                    if "solution" in sample
                    else problems[task_id]["complete_prompt"] + sample["completion"]
                )
                if calibrated:
                    solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution
                remainings.add(sample["_identifier"])
                args = (
                    completion_id[task_id],
                    problems[task_id],
                    solution,
                    max_as_limit,
                    max_data_limit,
                    max_stack_limit,
                    sample["_identifier"],
                    min_time_limit,
                    expected_time[task_id] if expected_time[task_id] else 20,
                )
                futures.append(executor.submit(check_correctness, *args))
                completion_id[task_id] += 1
                n_samples += 1

            assert n_samples == len(remainings), "Missing problems in unfinished"
            assert len(completion_id) == len(problems), "Missing problems in samples"

            for future in as_completed(futures):
                result = future.result()
                remainings.remove(result["_identifier"])
                eval_results[result["task_id"]].append(result)
                del future, result
                gc.collect()
        # Sort the results for each problem by completion_id.
        for task_id, task_results in eval_results.items():
            task_results.sort(key=lambda x: x["completion_id"])
            results["eval"][task_id] = []
            for res in task_results:
                stat, details = res["base"]
                results["eval"][task_id].append(
                    {
                        "task_id": task_id,
                        "solution": res["solution"],
                        "status": stat,
                        "details": details,
                    }
                )

        # Calculate pass@k.
        total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
        base_correct = []

        for key, res in results["eval"].items():
            if key not in problems:
                continue
            bc = sum([r["status"] == PASS for r in res])
            base_correct.append(bc)

        base_correct = np.array(base_correct)

        pass_at_k.update({
            f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
            for k in passk
            if total.min() >= k
        })

        del problems, futures
        gc.collect()

    pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
    pass_at_k["split"] = split
    pass_at_k["subset"] = subset
    pass_at_k["calibrated"] = calibrated
    pass_at_k["gt_pass_rate"] = gt_pass_rate
    pass_at_k["failed_tasks"] = failed_tasks

    return results, pass_at_k
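

# Gradio front end: each input control is passed positionally to evaluate(), in the
# same order as its parameters. An illustrative programmatic call (the file name shown
# here is hypothetical, following the "<model>--bigcodebench-..." naming convention
# that the result metadata logic expects):
#   results, pass_at_k = evaluate(
#       split="complete", subset="hard",
#       samples="my_model--bigcodebench-complete--samples.jsonl",
#   )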
# def run_gradio():
interface = gr.Interface(
    fn=evaluate,
    inputs=[
        gr.Dropdown(["complete", "instruct"], label="BigCodeBench Split"),
        gr.Dropdown(["full", "hard"], label="BigCodeBench Subset"),
        gr.File(label="Samples Path (.jsonl)"),
        gr.Textbox(label="Pass k Values (comma-separated)", value="1,5,10"),
        gr.Slider(-1, multiprocessing.cpu_count(), step=1, label="Parallel Workers", value=-1),
        gr.Slider(0.1, 10, step=0.1, label="Min Time Limit", value=1),
        gr.Slider(1, 100 * 1024, step=1024, label="Max AS Limit", value=30 * 1024),
        gr.Slider(1, 100 * 1024, step=1024, label="Max Data Limit", value=30 * 1024),
        gr.Slider(1, 100, step=1, label="Max Stack Limit", value=10),
        gr.Checkbox(label="Calibrated", value=True),
        gr.Checkbox(label="Check GT Only"),
        gr.Checkbox(label="No GT"),
        gr.Textbox(label="Selective Evaluated Task IDs (comma-separated, e.g. '0,1,2')", value=""),
    ],
    outputs=[
        gr.JSON(label="Results"),
        gr.JSON(label="Eval Results"),
    ],
    # concurrency_limit=None
)
interface.queue(default_concurrency_limit=None)
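

# Warm the ground-truth runtime cache for both subsets at startup so the first real
# evaluation request does not have to recompute it.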
def preload_gt():
    evaluate(split="complete", subset="full", samples="", check_gt_only=True)
    evaluate(split="complete", subset="hard", samples="", check_gt_only=True)
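

# Restart the Space through the Hub API; scheduled below to run every two hours.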
def restart_space():
    logging.info(f"Restarting space with repo ID: {REPO_ID}")
    try:
        # Now restart the space
        API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
        logging.info("Space restarted successfully.")
    except Exception as e:
        logging.error(f"Failed to restart space: {e}")


# if __name__ == "__main__":
preload_gt()

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", hours=2)  # Restart every 2 hours
scheduler.start()

interface.launch(show_error=True)