Spaces:
Runtime error
Runtime error
Commit
·
6b9a0ec
1
Parent(s):
84fc6ef
nathan-flagged-models-vis (#478)
Browse files
- Adds a way to hide flagged models (a69dfa979897081c10a30f1be9937a917d93422b)
- remove unused pprint import (1be35c2d1ffffab552d9d65f826930e4f9f1c273)
- remove unused pprint import (6adc61160db982ce023039472b8842d21584b367)
(cherry picked from commit 460ecf2f9814163d447819d75dd51e4139b4476b)
- app.py +16 -7
- src/display/utils.py +2 -0
- src/leaderboard/filter_models.py +18 -2
app.py
CHANGED
|
@@ -33,7 +33,6 @@ from src.display.utils import (
|
|
| 33 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
|
| 34 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 35 |
from src.submission.submit import add_new_eval
|
| 36 |
-
from src.submission.check_validity import already_submitted_models
|
| 37 |
from src.tools.collections import update_collections
|
| 38 |
from src.tools.plots import (
|
| 39 |
create_metric_plot_obj,
|
|
@@ -82,14 +81,20 @@ def update_table(
|
|
| 82 |
precision_query: str,
|
| 83 |
size_query: list,
|
| 84 |
show_deleted: bool,
|
|
|
|
| 85 |
query: str,
|
| 86 |
):
|
| 87 |
-
filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
|
| 88 |
filtered_df = filter_queries(query, filtered_df)
|
| 89 |
df = select_columns(filtered_df, columns)
|
| 90 |
return df
|
| 91 |
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
| 94 |
return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
|
| 95 |
|
|
@@ -127,7 +132,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
|
|
| 127 |
|
| 128 |
|
| 129 |
def filter_models(
|
| 130 |
-
df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
|
| 131 |
) -> pd.DataFrame:
|
| 132 |
# Show all models
|
| 133 |
if show_deleted:
|
|
@@ -135,6 +140,9 @@ def filter_models(
|
|
| 135 |
else: # Show only still on the hub models
|
| 136 |
filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
|
| 137 |
|
|
|
|
|
|
|
|
|
|
| 138 |
type_emoji = [t[0] for t in type_query]
|
| 139 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
| 140 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
|
|
@@ -146,6 +154,7 @@ def filter_models(
|
|
| 146 |
|
| 147 |
return filtered_df
|
| 148 |
|
|
|
|
| 149 |
|
| 150 |
import unicodedata
|
| 151 |
|
|
@@ -175,11 +184,11 @@ hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
|
| 175 |
|
| 176 |
def display(x, y):
|
| 177 |
# Assuming df is your DataFrame
|
| 178 |
-
for column in
|
| 179 |
-
if
|
| 180 |
-
|
| 181 |
|
| 182 |
-
subset_df =
|
| 183 |
# Ensure the output directory exists
|
| 184 |
#output_dir = 'output'
|
| 185 |
#if not os.path.exists(output_dir):
|
|
|
|
| 33 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
|
| 34 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 35 |
from src.submission.submit import add_new_eval
|
|
|
|
| 36 |
from src.tools.collections import update_collections
|
| 37 |
from src.tools.plots import (
|
| 38 |
create_metric_plot_obj,
|
|
|
|
| 81 |
precision_query: str,
|
| 82 |
size_query: list,
|
| 83 |
show_deleted: bool,
|
| 84 |
+
show_flagged: bool,
|
| 85 |
query: str,
|
| 86 |
):
|
| 87 |
+
filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted, show_flagged)
|
| 88 |
filtered_df = filter_queries(query, filtered_df)
|
| 89 |
df = select_columns(filtered_df, columns)
|
| 90 |
return df
|
| 91 |
|
| 92 |
|
| 93 |
+
def load_query(request: gr.Request): # triggered only once at startup => read query parameter if it exists
|
| 94 |
+
query = request.query_params.get("query") or ""
|
| 95 |
+
return query, query # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
|
| 96 |
+
|
| 97 |
+
|
| 98 |
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
| 99 |
return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
|
| 100 |
|
|
|
|
| 132 |
|
| 133 |
|
| 134 |
def filter_models(
|
| 135 |
+
df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool, show_flagged: bool
|
| 136 |
) -> pd.DataFrame:
|
| 137 |
# Show all models
|
| 138 |
if show_deleted:
|
|
|
|
| 140 |
else: # Show only still on the hub models
|
| 141 |
filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
|
| 142 |
|
| 143 |
+
if not show_flagged:
|
| 144 |
+
filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
|
| 145 |
+
|
| 146 |
type_emoji = [t[0] for t in type_query]
|
| 147 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
| 148 |
filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
|
|
|
|
| 154 |
|
| 155 |
return filtered_df
|
| 156 |
|
| 157 |
+
leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False)
|
| 158 |
|
| 159 |
import unicodedata
|
| 160 |
|
|
|
|
| 184 |
|
| 185 |
def display(x, y):
|
| 186 |
# Assuming df is your DataFrame
|
| 187 |
+
for column in leaderboard_df.columns:
|
| 188 |
+
if leaderboard_df[column].dtype == 'object':
|
| 189 |
+
leaderboard_df[column] = leaderboard_df[column].apply(remove_invalid_unicode)
|
| 190 |
|
| 191 |
+
subset_df = leaderboard_df[COLS]
|
| 192 |
# Ensure the output directory exists
|
| 193 |
#output_dir = 'output'
|
| 194 |
#if not os.path.exists(output_dir):
|
src/display/utils.py
CHANGED
|
@@ -51,6 +51,7 @@ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B
|
|
| 51 |
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub", "number", False)])
|
| 52 |
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
| 53 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
|
|
|
| 54 |
# Dummy column for the search bar (hidden by the custom CSS)
|
| 55 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
| 56 |
|
|
@@ -80,6 +81,7 @@ baseline_row = {
|
|
| 80 |
AutoEvalColumn.gsm8k.name: 0.21,
|
| 81 |
AutoEvalColumn.dummy.name: "baseline",
|
| 82 |
AutoEvalColumn.model_type.name: "",
|
|
|
|
| 83 |
}
|
| 84 |
|
| 85 |
# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
|
|
|
|
| 51 |
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub", "number", False)])
|
| 52 |
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
| 53 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
| 54 |
+
auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, False)])
|
| 55 |
# Dummy column for the search bar (hidden by the custom CSS)
|
| 56 |
auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
|
| 57 |
|
|
|
|
| 81 |
AutoEvalColumn.gsm8k.name: 0.21,
|
| 82 |
AutoEvalColumn.dummy.name: "baseline",
|
| 83 |
AutoEvalColumn.model_type.name: "",
|
| 84 |
+
AutoEvalColumn.flagged.name: False,
|
| 85 |
}
|
| 86 |
|
| 87 |
# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
|
src/leaderboard/filter_models.py
CHANGED
|
@@ -13,13 +13,26 @@ FLAGGED_MODELS = {
|
|
| 13 |
"AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
|
| 14 |
"AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
|
| 15 |
"AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
|
| 16 |
-
"
|
| 17 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
}
|
| 19 |
|
| 20 |
# Models which have been requested by orgs to not be submitted on the leaderboard
|
| 21 |
DO_NOT_SUBMIT_MODELS = [
|
| 22 |
"Voicelab/trurl-2-13b", # trained on MMLU
|
|
|
|
|
|
|
|
|
|
| 23 |
]
|
| 24 |
|
| 25 |
|
|
@@ -34,6 +47,9 @@ def flag_models(leaderboard_data: list[dict]):
|
|
| 34 |
model_data[
|
| 35 |
AutoEvalColumn.model.name
|
| 36 |
] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
def remove_forbidden_models(leaderboard_data: list[dict]):
|
|
|
|
| 13 |
"AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
|
| 14 |
"AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
|
| 15 |
"AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
|
| 16 |
+
"fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/444",
|
| 17 |
+
"jan-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
|
| 18 |
+
"rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
|
| 19 |
+
"rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
|
| 20 |
+
"GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
|
| 21 |
+
"GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
|
| 22 |
+
"GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
|
| 23 |
+
"viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
|
| 24 |
+
"GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
|
| 25 |
+
"janai-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
|
| 26 |
+
"ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
|
| 27 |
+
"fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
|
| 28 |
}
|
| 29 |
|
| 30 |
# Models which have been requested by orgs to not be submitted on the leaderboard
|
| 31 |
DO_NOT_SUBMIT_MODELS = [
|
| 32 |
"Voicelab/trurl-2-13b", # trained on MMLU
|
| 33 |
+
"TigerResearch/tigerbot-70b-chat", # per authors request
|
| 34 |
+
"TigerResearch/tigerbot-70b-chat-v2", # per authors request
|
| 35 |
+
"TigerResearch/tigerbot-70b-chat-v4-4k", # per authors request
|
| 36 |
]
|
| 37 |
|
| 38 |
|
|
|
|
| 47 |
model_data[
|
| 48 |
AutoEvalColumn.model.name
|
| 49 |
] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
|
| 50 |
+
model_data[AutoEvalColumn.flagged.name] = True
|
| 51 |
+
else:
|
| 52 |
+
model_data[AutoEvalColumn.flagged.name] = False
|
| 53 |
|
| 54 |
|
| 55 |
def remove_forbidden_models(leaderboard_data: list[dict]):
|