meg-huggingface committed
Commit · 67a80c3
Parent(s): 5506f29

Updates for newest lm_eval version, 0.4.3

Files changed:
- app.py +2 -2
- main_backend_harness.py +1 -2
- requirements.txt +2 -2
- src/backend/run_eval_suite_harness.py +12 -9
app.py
CHANGED
@@ -8,11 +8,11 @@ configure_root_logger()
 from functools import partial
 
 import gradio as gr
-from main_backend_lighteval import run_auto_eval
-# from main_backend_harness import run_auto_eval
 from src.display.log_visualizer import log_file_to_html_string
 from src.display.css_html_js import dark_mode_gradio_js
 from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
+# from main_backend_lighteval import run_auto_eval
+from main_backend_harness import run_auto_eval
 from src.logging import setup_logger, log_file
 
 logging.basicConfig(level=logging.INFO)
main_backend_harness.py
CHANGED
@@ -70,9 +70,8 @@ def run_auto_eval():
         num_fewshot=NUM_FEWSHOT,
         local_dir=EVAL_RESULTS_PATH_BACKEND,
         results_repo=RESULTS_REPO,
-        batch_size=
+        batch_size="auto",
         device=DEVICE,
-        no_cache=True,
         limit=LIMIT
     )
 
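For orientation, a hedged sketch of the full run_evaluation call once this hunk is applied. Only the keyword arguments visible in the hunk come from the diff; the remaining argument names (eval_request, TASKS_HARNESS) are assumptions based on the surrounding backend code, not confirmed by this commit.

# Sketch only: arguments not visible in the hunk above are assumed, not confirmed.
run_evaluation(
    eval_request=eval_request,            # assumed: the pending request being processed
    task_names=TASKS_HARNESS,             # assumed constant naming the harness task list
    num_fewshot=NUM_FEWSHOT,
    local_dir=EVAL_RESULTS_PATH_BACKEND,
    results_repo=RESULTS_REPO,
    batch_size="auto",                    # lm_eval 0.4.x can size the batch automatically
    device=DEVICE,
    limit=LIMIT,                          # no_cache is dropped: simple_evaluate no longer accepts it
)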
requirements.txt
CHANGED
@@ -5,12 +5,12 @@ huggingface-hub>=0.18.0
 python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
-accelerate
+accelerate>=0.26.0
 sentencepiece
 
 # Evaluation suites
 lighteval
-lm_eval
+lm_eval==0.4.3
 
 # Log Visualizer
 BeautifulSoup4==4.12.2
src/backend/run_eval_suite_harness.py
CHANGED
@@ -4,26 +4,29 @@ import logging
 from datetime import datetime
 
 from lm_eval import tasks, evaluator, utils
+from lm_eval.tasks import TaskManager
 
 from src.envs import RESULTS_REPO, API
 from src.backend.manage_requests import EvalRequest
 from src.logging import setup_logger
 
+from typing import Union
+
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: int, device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
+def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: Union[int, str], device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
     """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
 
     Args:
         eval_request (EvalRequest): Input evaluation request file representation
         task_names (list): Tasks to launch
         num_fewshot (int): Number of few shots to use
-        batch_size (int): Selected batch size
-        device (str): "cpu" or "
+        batch_size (int or str): Selected batch size or 'auto'
+        device (str): "cpu" or "cuda:0", depending on what you assigned to the space
         local_dir (str): Where to save the results locally
         results_repo (str): To which repository to upload the results
-        no_cache (bool, optional): Whether to use a cache or not
+        no_cache (bool, optional): Whether to use a cache or not
         limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
 
     Returns:
@@ -34,21 +37,21 @@ def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int
             "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
         )
 
-
+    task_manager = TaskManager()
+    all_tasks = task_manager.all_tasks
+    task_names = utils.pattern_match(task_names, all_tasks)
 
     logger.info(f"Selected Tasks: {task_names}")
 
     results = evaluator.simple_evaluate(
-        model="hf
+        model="hf",
         model_args=eval_request.get_model_args(),
         tasks=task_names,
        num_fewshot=num_fewshot,
         batch_size=batch_size,
         device=device,
-        no_cache=no_cache,
         limit=limit,
-        write_out=True,
-        output_base_path="logs"
+        write_out=True # Whether to write out an example document and model input, for checking task integrity
     )
 
     results["config"]["model_dtype"] = eval_request.precision
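Taken together, this file now follows the lm_eval 0.4.3 API: tasks are resolved through TaskManager and pattern-matched with utils.pattern_match, and simple_evaluate is called with model="hf" and without the no_cache / output_base_path arguments. A minimal, self-contained sketch of that call pattern follows; the task pattern and model_args are placeholders for illustration only, not values from this Space.

# Minimal sketch of the lm_eval 0.4.3 pattern used above; the task pattern and
# model_args below are placeholders, not values taken from this repository.
from lm_eval import evaluator, utils
from lm_eval.tasks import TaskManager

task_manager = TaskManager()                    # discovers the registered task configs
all_tasks = task_manager.all_tasks              # names of every available task
task_names = utils.pattern_match(["hellaswag"], all_tasks)  # expand patterns to concrete task names

results = evaluator.simple_evaluate(
    model="hf",                                 # Hugging Face transformers backend
    model_args="pretrained=gpt2",               # placeholder; the Space passes eval_request.get_model_args()
    tasks=task_names,
    num_fewshot=0,
    batch_size="auto",                          # let the harness choose a batch size that fits
    device="cpu",
    limit=10,                                   # small sample count, for smoke-testing only
    write_out=True,                             # dump an example document and model input per task
)
print(results["results"])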