Spaces:
Runtime error
Runtime error
jinsol-neubla
committed on
Commit
·
9a04f8c
1
Parent(s):
3066149
Fix GSM8k key change issue
Browse files(get-answer -> strict-match)
Signed-off-by: jinsol-neubla <jinsol.kim@neubla.com>
- app.py +18 -18
- requirements.txt +3 -3
- src/display/utils.py +15 -4
- src/leaderboard/read_evals.py +6 -3
app.py
CHANGED
|
@@ -80,7 +80,7 @@ leaderboard_df, original_df, plot_df = init_space()
|
|
| 80 |
def update_table(
|
| 81 |
hidden_df: pd.DataFrame,
|
| 82 |
columns: list,
|
| 83 |
-
type_query: list,
|
| 84 |
weight_precision_query: str,
|
| 85 |
activation_precision_query: str,
|
| 86 |
size_query: list,
|
|
@@ -90,7 +90,7 @@ def update_table(
|
|
| 90 |
):
|
| 91 |
filtered_df = filter_models(
|
| 92 |
df=hidden_df,
|
| 93 |
-
type_query=type_query,
|
| 94 |
size_query=size_query,
|
| 95 |
weight_precision_query=weight_precision_query,
|
| 96 |
activation_precision_query=activation_precision_query,
|
|
@@ -151,7 +151,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
|
|
| 151 |
|
| 152 |
def filter_models(
|
| 153 |
df: pd.DataFrame,
|
| 154 |
-
type_query: list,
|
| 155 |
size_query: list,
|
| 156 |
weight_precision_query: list,
|
| 157 |
activation_precision_query: list,
|
|
@@ -173,8 +173,8 @@ def filter_models(
|
|
| 173 |
if "Flagged" in hide_models:
|
| 174 |
filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
|
| 175 |
|
| 176 |
-
type_emoji = [t[0] for t in type_query]
|
| 177 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
| 178 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.weight_precision.name].isin(weight_precision_query + ["None"])]
|
| 179 |
filtered_df = filtered_df.loc[
|
| 180 |
df[AutoEvalColumn.activation_precision.name].isin(activation_precision_query + ["None"])
|
|
@@ -191,7 +191,7 @@ def filter_models(
|
|
| 191 |
|
| 192 |
leaderboard_df = filter_models(
|
| 193 |
df=leaderboard_df,
|
| 194 |
-
type_query=[t.to_str(" : ") for t in ModelType],
|
| 195 |
size_query=list(NUMERIC_INTERVALS.keys()),
|
| 196 |
weight_precision_query=[i.value.name for i in Precision],
|
| 197 |
activation_precision_query=[i.value.name for i in Precision],
|
|
@@ -239,13 +239,13 @@ with demo:
|
|
| 239 |
)
|
| 240 |
with gr.Column(min_width=320):
|
| 241 |
# with gr.Box(elem_id="box-filter"):
|
| 242 |
-
filter_columns_type = gr.CheckboxGroup(
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
)
|
| 249 |
filter_columns_weight_precision = gr.CheckboxGroup(
|
| 250 |
label="Weight Precision",
|
| 251 |
choices=[i.value.name for i in Precision],
|
|
@@ -301,7 +301,7 @@ with demo:
|
|
| 301 |
[
|
| 302 |
hidden_leaderboard_table_for_search,
|
| 303 |
shown_columns,
|
| 304 |
-
filter_columns_type,
|
| 305 |
filter_columns_weight_precision,
|
| 306 |
filter_columns_activation_precision,
|
| 307 |
filter_columns_size,
|
|
@@ -319,7 +319,7 @@ with demo:
|
|
| 319 |
[
|
| 320 |
hidden_leaderboard_table_for_search,
|
| 321 |
shown_columns,
|
| 322 |
-
filter_columns_type,
|
| 323 |
filter_columns_weight_precision,
|
| 324 |
filter_columns_activation_precision,
|
| 325 |
filter_columns_size,
|
|
@@ -334,7 +334,7 @@ with demo:
|
|
| 334 |
|
| 335 |
for selector in [
|
| 336 |
shown_columns,
|
| 337 |
-
filter_columns_type,
|
| 338 |
filter_columns_weight_precision,
|
| 339 |
filter_columns_activation_precision,
|
| 340 |
filter_columns_size,
|
|
@@ -346,7 +346,7 @@ with demo:
|
|
| 346 |
[
|
| 347 |
hidden_leaderboard_table_for_search,
|
| 348 |
shown_columns,
|
| 349 |
-
filter_columns_type,
|
| 350 |
filter_columns_weight_precision,
|
| 351 |
filter_columns_activation_precision,
|
| 352 |
filter_columns_size,
|
|
@@ -391,4 +391,4 @@ scheduler = BackgroundScheduler()
|
|
| 391 |
scheduler.add_job(restart_space, "interval", seconds=1800)  # restarted every 30 min (1800 s)
|
| 392 |
scheduler.start()
|
| 393 |
|
| 394 |
-
demo.queue(default_concurrency_limit=40).launch(
|
|
|
|
| 80 |
def update_table(
|
| 81 |
hidden_df: pd.DataFrame,
|
| 82 |
columns: list,
|
| 83 |
+
# type_query: list,
|
| 84 |
weight_precision_query: str,
|
| 85 |
activation_precision_query: str,
|
| 86 |
size_query: list,
|
|
|
|
| 90 |
):
|
| 91 |
filtered_df = filter_models(
|
| 92 |
df=hidden_df,
|
| 93 |
+
# type_query=type_query,
|
| 94 |
size_query=size_query,
|
| 95 |
weight_precision_query=weight_precision_query,
|
| 96 |
activation_precision_query=activation_precision_query,
|
|
|
|
| 151 |
|
| 152 |
def filter_models(
|
| 153 |
df: pd.DataFrame,
|
| 154 |
+
# type_query: list,
|
| 155 |
size_query: list,
|
| 156 |
weight_precision_query: list,
|
| 157 |
activation_precision_query: list,
|
|
|
|
| 173 |
if "Flagged" in hide_models:
|
| 174 |
filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
|
| 175 |
|
| 176 |
+
# type_emoji = [t[0] for t in type_query]
|
| 177 |
+
# filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
| 178 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.weight_precision.name].isin(weight_precision_query + ["None"])]
|
| 179 |
filtered_df = filtered_df.loc[
|
| 180 |
df[AutoEvalColumn.activation_precision.name].isin(activation_precision_query + ["None"])
|
|
|
|
| 191 |
|
| 192 |
leaderboard_df = filter_models(
|
| 193 |
df=leaderboard_df,
|
| 194 |
+
# type_query=[t.to_str(" : ") for t in ModelType],
|
| 195 |
size_query=list(NUMERIC_INTERVALS.keys()),
|
| 196 |
weight_precision_query=[i.value.name for i in Precision],
|
| 197 |
activation_precision_query=[i.value.name for i in Precision],
|
|
|
|
| 239 |
)
|
| 240 |
with gr.Column(min_width=320):
|
| 241 |
# with gr.Box(elem_id="box-filter"):
|
| 242 |
+
# filter_columns_type = gr.CheckboxGroup(
|
| 243 |
+
# label="Model types",
|
| 244 |
+
# choices=[t.to_str() for t in ModelType],
|
| 245 |
+
# value=[t.to_str() for t in ModelType],
|
| 246 |
+
# interactive=True,
|
| 247 |
+
# elem_id="filter-columns-type",
|
| 248 |
+
# )
|
| 249 |
filter_columns_weight_precision = gr.CheckboxGroup(
|
| 250 |
label="Weight Precision",
|
| 251 |
choices=[i.value.name for i in Precision],
|
|
|
|
| 301 |
[
|
| 302 |
hidden_leaderboard_table_for_search,
|
| 303 |
shown_columns,
|
| 304 |
+
# filter_columns_type,
|
| 305 |
filter_columns_weight_precision,
|
| 306 |
filter_columns_activation_precision,
|
| 307 |
filter_columns_size,
|
|
|
|
| 319 |
[
|
| 320 |
hidden_leaderboard_table_for_search,
|
| 321 |
shown_columns,
|
| 322 |
+
# filter_columns_type,
|
| 323 |
filter_columns_weight_precision,
|
| 324 |
filter_columns_activation_precision,
|
| 325 |
filter_columns_size,
|
|
|
|
| 334 |
|
| 335 |
for selector in [
|
| 336 |
shown_columns,
|
| 337 |
+
# filter_columns_type,
|
| 338 |
filter_columns_weight_precision,
|
| 339 |
filter_columns_activation_precision,
|
| 340 |
filter_columns_size,
|
|
|
|
| 346 |
[
|
| 347 |
hidden_leaderboard_table_for_search,
|
| 348 |
shown_columns,
|
| 349 |
+
# filter_columns_type,
|
| 350 |
filter_columns_weight_precision,
|
| 351 |
filter_columns_activation_precision,
|
| 352 |
filter_columns_size,
|
|
|
|
| 391 |
scheduler.add_job(restart_space, "interval", seconds=1800)  # restarted every 30 min (1800 s)
|
| 392 |
scheduler.start()
|
| 393 |
|
| 394 |
+
demo.queue(default_concurrency_limit=40).launch()
|
requirements.txt
CHANGED
|
@@ -2,15 +2,15 @@ APScheduler==3.10.1
|
|
| 2 |
black==23.11.0
|
| 3 |
click==8.1.3
|
| 4 |
datasets==2.14.5
|
| 5 |
-
gradio==4.
|
| 6 |
-
gradio_client
|
| 7 |
huggingface-hub>=0.18.0
|
| 8 |
matplotlib==3.7.1
|
| 9 |
numpy==1.24.2
|
| 10 |
pandas==2.0.0
|
| 11 |
plotly==5.14.1
|
| 12 |
python-dateutil==2.8.2
|
| 13 |
-
requests
|
| 14 |
sentencepiece
|
| 15 |
tqdm==4.65.0
|
| 16 |
transformers==4.37.0
|
|
|
|
| 2 |
black==23.11.0
|
| 3 |
click==8.1.3
|
| 4 |
datasets==2.14.5
|
| 5 |
+
gradio==4.29.0
|
| 6 |
+
gradio_client
|
| 7 |
huggingface-hub>=0.18.0
|
| 8 |
matplotlib==3.7.1
|
| 9 |
numpy==1.24.2
|
| 10 |
pandas==2.0.0
|
| 11 |
plotly==5.14.1
|
| 12 |
python-dateutil==2.8.2
|
| 13 |
+
requests
|
| 14 |
sentencepiece
|
| 15 |
tqdm==4.65.0
|
| 16 |
transformers==4.37.0
|
src/display/utils.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from dataclasses import dataclass, make_dataclass
|
| 2 |
from enum import Enum
|
| 3 |
from altair import Column
|
|
|
|
| 4 |
|
| 5 |
import pandas as pd
|
| 6 |
|
|
@@ -12,7 +13,7 @@ def fields(raw_class):
|
|
| 12 |
@dataclass
|
| 13 |
class Task:
|
| 14 |
benchmark: str
|
| 15 |
-
metric: str
|
| 16 |
col_name: str
|
| 17 |
|
| 18 |
|
|
@@ -22,7 +23,17 @@ class Tasks(Enum):
|
|
| 22 |
mmlu = Task("mmlu", "acc", "MMLU")
|
| 23 |
truthfulqa = Task("truthfulqa_mc2", "acc", "TruthfulQA")
|
| 24 |
winogrande = Task("winogrande", "acc", "Winogrande")
|
| 25 |
-
gsm8k = Task("gsm8k", "exact_match,get-answer", "GSM8K")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
# These classes are for user facing column names,
|
|
@@ -40,7 +51,7 @@ class ColumnContent:
|
|
| 40 |
|
| 41 |
auto_eval_column_dict = []
|
| 42 |
# Init
|
| 43 |
-
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
| 44 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
| 45 |
# Scores
|
| 46 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
|
@@ -173,7 +184,7 @@ class Precision(Enum):
|
|
| 173 |
Unknown = ModelDetails("?")
|
| 174 |
|
| 175 |
def from_str(precision):
|
| 176 |
-
if precision in ["torch.float16", "float16"]:
|
| 177 |
return Precision.float16
|
| 178 |
if precision in ["torch.bfloat16", "bfloat16"]:
|
| 179 |
return Precision.bfloat16
|
|
|
|
| 1 |
from dataclasses import dataclass, make_dataclass
|
| 2 |
from enum import Enum
|
| 3 |
from altair import Column
|
| 4 |
+
from typing import Union, List, Dict
|
| 5 |
|
| 6 |
import pandas as pd
|
| 7 |
|
|
|
|
| 13 |
@dataclass
|
| 14 |
class Task:
|
| 15 |
benchmark: str
|
| 16 |
+
metric: Union[str, List[str]]
|
| 17 |
col_name: str
|
| 18 |
|
| 19 |
|
|
|
|
| 23 |
mmlu = Task("mmlu", "acc", "MMLU")
|
| 24 |
truthfulqa = Task("truthfulqa_mc2", "acc", "TruthfulQA")
|
| 25 |
winogrande = Task("winogrande", "acc", "Winogrande")
|
| 26 |
+
gsm8k = Task("gsm8k", ["exact_match,get-answer", "exact_match,strict-match"], "GSM8K")
|
| 27 |
+
|
| 28 |
+
@staticmethod
|
| 29 |
+
def get_metric(task: Task, dict_results: Dict[str, float]):
|
| 30 |
+
if isinstance(task.metric, str):
|
| 31 |
+
return dict_results[task.metric]
|
| 32 |
+
else:
|
| 33 |
+
for metric in task.metric:
|
| 34 |
+
if metric in dict_results:
|
| 35 |
+
return dict_results[metric]
|
| 36 |
+
return None
|
| 37 |
|
| 38 |
|
| 39 |
# These classes are for user facing column names,
|
|
|
|
| 51 |
|
| 52 |
auto_eval_column_dict = []
|
| 53 |
# Init
|
| 54 |
+
# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
| 55 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
| 56 |
# Scores
|
| 57 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
|
|
|
| 184 |
Unknown = ModelDetails("?")
|
| 185 |
|
| 186 |
def from_str(precision):
|
| 187 |
+
if precision in ["torch.float16", "float16", "fp16"]:
|
| 188 |
return Precision.float16
|
| 189 |
if precision in ["torch.bfloat16", "bfloat16"]:
|
| 190 |
return Precision.bfloat16
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -94,7 +94,7 @@ class EvalResult:
|
|
| 94 |
if task.benchmark == "mmlu":
|
| 95 |
accs = np.array([data["results"].get(task.benchmark, {}).get(task.metric, None)])
|
| 96 |
else:
|
| 97 |
-
accs = np.array([
|
| 98 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
| 99 |
continue
|
| 100 |
|
|
@@ -154,7 +154,7 @@ class EvalResult:
|
|
| 154 |
AutoEvalColumn.weight_precision.name: self.weight_precision.value.name,
|
| 155 |
AutoEvalColumn.activation_precision.name: self.activation_precision.value.name,
|
| 156 |
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
| 157 |
-
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
| 158 |
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
| 159 |
AutoEvalColumn.architecture.name: self.architecture,
|
| 160 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
|
@@ -216,6 +216,7 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
|
|
| 216 |
|
| 217 |
eval_results = {}
|
| 218 |
for model_result_filepath in model_result_filepaths:
|
|
|
|
| 219 |
# Creation of result
|
| 220 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
| 221 |
|
|
@@ -232,7 +233,9 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
|
|
| 232 |
if v.status == "FINISHED":
|
| 233 |
v.to_dict() # we test if the dict version is complete
|
| 234 |
results.append(v)
|
| 235 |
-
except KeyError: # not all eval values present
|
|
|
|
|
|
|
| 236 |
continue
|
| 237 |
|
| 238 |
return results
|
|
|
|
| 94 |
if task.benchmark == "mmlu":
|
| 95 |
accs = np.array([data["results"].get(task.benchmark, {}).get(task.metric, None)])
|
| 96 |
else:
|
| 97 |
+
accs = np.array([Tasks.get_metric(task, v) for k, v in data["results"].items() if task.benchmark in k])
|
| 98 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
| 99 |
continue
|
| 100 |
|
|
|
|
| 154 |
AutoEvalColumn.weight_precision.name: self.weight_precision.value.name,
|
| 155 |
AutoEvalColumn.activation_precision.name: self.activation_precision.value.name,
|
| 156 |
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
| 157 |
+
# AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
| 158 |
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
| 159 |
AutoEvalColumn.architecture.name: self.architecture,
|
| 160 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
|
|
|
| 216 |
|
| 217 |
eval_results = {}
|
| 218 |
for model_result_filepath in model_result_filepaths:
|
| 219 |
+
print(f"Read {model_result_filepath}")
|
| 220 |
# Creation of result
|
| 221 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
| 222 |
|
|
|
|
| 233 |
if v.status == "FINISHED":
|
| 234 |
v.to_dict() # we test if the dict version is complete
|
| 235 |
results.append(v)
|
| 236 |
+
except KeyError as e: # not all eval values present
|
| 237 |
+
print(f"Fail to get results from {v.eval_name} with the error {e}")
|
| 238 |
+
print(v)
|
| 239 |
continue
|
| 240 |
|
| 241 |
return results
|