Separate zero-shot & few-shot results.
The leaderboard is split into separate zero-shot and few-shot views: results are read from zero-shot/ and few-shot/ subfolders of the results path, tasks can be flagged as zero-shot-only, and the columns of such tasks are hidden by default in the few-shot table, with fully empty columns dropped from either view.

- app.py +9 -4
- src/about.py +3 -2
- src/display/utils.py +4 -3
- src/leaderboard/read_evals.py +3 -3
- src/populate.py +1 -0
app.py

@@ -1,3 +1,4 @@
+import os
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
@@ -48,7 +49,8 @@ except Exception:
     restart_space()


-
+LEADERBOARD_0_SHOT_DF = get_leaderboard_df(os.path.join(EVAL_RESULTS_PATH, "zero-shot"), EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_1_SHOT_DF = get_leaderboard_df(os.path.join(EVAL_RESULTS_PATH, "few-shot"), EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

 (
     finished_eval_queue_df,
@@ -56,14 +58,14 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

-def init_leaderboard(dataframe):
+def init_leaderboard(dataframe, fewshot=True):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
         select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not (fewshot and c.hidden_in_fewshot)],
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
@@ -104,7 +106,10 @@ with demo:

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-
+            with gr.TabItem("Zero-Shot", elem_id="zero-shot"):
+                leaderboard = init_leaderboard(LEADERBOARD_0_SHOT_DF, fewshot=False)
+            with gr.TabItem("Few-Shot", elem_id="few-shot"):
+                leaderboard = init_leaderboard(LEADERBOARD_1_SHOT_DF, fewshot=True)

         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
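The LLM Benchmark tab now nests a Zero-Shot and a Few-Shot sub-tab, each built from its own DataFrame read from a zero-shot/ or few-shot/ subfolder of EVAL_RESULTS_PATH, and init_leaderboard gains a fewshot flag that keeps columns marked hidden_in_fewshot out of the few-shot default selection. The following is a minimal, self-contained sketch of that layout, with toy DataFrames and gr.Dataframe standing in for gradio_leaderboard's Leaderboard; the column names and values are illustrative only.

import gradio as gr
import pandas as pd

# Toy stand-ins for LEADERBOARD_0_SHOT_DF / LEADERBOARD_1_SHOT_DF.
zero_shot_df = pd.DataFrame({"Model": ["model-a"], "Belebele (Accuracy)": [61.2], "Average (All) ⬆️": [58.4]})
few_shot_df = pd.DataFrame({"Model": ["model-a"], "Average (All) ⬆️": [60.1]})

def make_table(df: pd.DataFrame, fewshot: bool) -> gr.Dataframe:
    # The real init_leaderboard only removes hidden_in_fewshot columns from the
    # default selection; this sketch drops them from the table outright.
    zero_shot_only_cols = {"Belebele (Accuracy)"}  # illustrative
    cols = [c for c in df.columns if not (fewshot and c in zero_shot_only_cols)]
    return gr.Dataframe(value=df[cols])

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Zero-Shot"):
            make_table(zero_shot_df, fewshot=False)
        with gr.TabItem("Few-Shot"):
            make_table(few_shot_df, fewshot=True)

if __name__ == "__main__":
    demo.launch()

Running this shows the Belebele column only under Zero-Shot, which mirrors what the fewshot filter achieves for the default column selection in the real app.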
src/about.py

@@ -23,18 +23,19 @@ class Task:
     url: str
     task_type: TaskType
     is_primary_metric: bool = True
+    zero_shot_only: bool = False


 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("sentiment_mlt", "f1", "Sentiment Analysis (F1)", "https://github.com/jerbarnes/typology_of_crosslingual/tree/master/data/sentiment/mt", TaskType.NLU)
     task1 = Task("sib200_mlt", "f1", "SIB200 (F1)", "https://huggingface.co/datasets/Davlan/sib200/viewer/mlt_Latn", TaskType.NLU)
     task2 = Task("taxi1500_mlt", "f1", "Taxi1500 (F1)", "https://github.com/cisnlp/Taxi1500", TaskType.NLU)
     task3 = Task("maltese_news_categories", "loglikelihood", "Maltese News Categories (F1)", "https://huggingface.co/datasets/MLRS/maltese_news_categories", TaskType.NLU)
     task4 = Task("multieurlex_mlt", "loglikelihood", "MultiEURLEX (F1)", "https://huggingface.co/datasets/nlpaueb/multi_eurlex", TaskType.NLU)
-    task5 = Task("belebele_mlt", "acc", "Belebele (Accuracy)", "https://huggingface.co/datasets/facebook/belebele/viewer/mlt_Latn", TaskType.NLU)
+    task5 = Task("belebele_mlt", "acc", "Belebele (Accuracy)", "https://huggingface.co/datasets/facebook/belebele/viewer/mlt_Latn", TaskType.NLU, zero_shot_only=True)
     task6 = Task("opus100_eng-mlt", "bleu", "OPUS-100 EN→MT (BLEU)", "https://huggingface.co/datasets/MLRS/OPUS-MT-EN-Fixed", TaskType.NLG, False)
     task7 = Task("opus100_eng-mlt", "chrf", "OPUS-100 EN→MT (ChrF)", "https://huggingface.co/datasets/MLRS/OPUS-MT-EN-Fixed", TaskType.NLG)
     task8 = Task("flores200_eng-mlt", "bleu", "Flores-200 EN→MT (BLEU)", "https://huggingface.co/datasets/Muennighoff/flores200", TaskType.NLG, False)
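The Task dataclass gains a zero_shot_only flag, set here only for Belebele, so downstream code can treat that task differently in few-shot runs. A trimmed-down sketch of the pattern follows; the real Task also carries url and task_type fields, which are omitted here.

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    is_primary_metric: bool = True
    zero_shot_only: bool = False

class Tasks(Enum):
    sentiment = Task("sentiment_mlt", "f1", "Sentiment Analysis (F1)")
    belebele = Task("belebele_mlt", "acc", "Belebele (Accuracy)", zero_shot_only=True)

# Downstream code can filter on the flag, e.g. when building the few-shot view:
few_shot_cols = [t.value.col_name for t in Tasks if not t.value.zero_shot_only]
print(few_shot_cols)  # ['Sentiment Analysis (F1)']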
src/display/utils.py

@@ -20,20 +20,21 @@ class ColumnContent:
     displayed_by_default: bool
     hidden: bool = False
     never_hidden: bool = False
+    hidden_in_fewshot: bool = False

 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("N-Shot", "number",
-auto_eval_column_dict.append(["prompt_version", ColumnContent, ColumnContent("Version", "str",
+auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("N-Shot", "number", True)])
+auto_eval_column_dict.append(["prompt_version", ColumnContent, ColumnContent("Version", "str", True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average (All) ⬆️", "number", True)])
 for task_type in TaskType:
     auto_eval_column_dict.append([task_type.value.name, ColumnContent, ColumnContent(f"Average ({task_type.value.display_name}) {task_type.value.symbol}", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", task.value.is_primary_metric)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", task.value.is_primary_metric, hidden_in_fewshot=task.value.zero_shot_only)])
 # Model information
 auto_eval_column_dict.append(["model_training", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["maltese_training", ColumnContent, ColumnContent("Maltese Training", "str", False)])
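Each task column now records hidden_in_fewshot, wired from the task's zero_shot_only flag, and the N-Shot and Version columns are displayed by default. Below is a small sketch of how the flag feeds the default column selection used by init_leaderboard in app.py; ColumnContent is simplified here, and in the real module the columns are assembled into AutoEvalColumn via make_dataclass.

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    hidden_in_fewshot: bool = False

columns = [
    ColumnContent("Model", "markdown", True, never_hidden=True),
    ColumnContent("Average (All) ⬆️", "number", True),
    ColumnContent("Belebele (Accuracy)", "number", True, hidden_in_fewshot=True),
]

def default_selection(cols, fewshot):
    # Same predicate as the one added to init_leaderboard in app.py.
    return [c.name for c in cols if c.displayed_by_default and not (fewshot and c.hidden_in_fewshot)]

print(default_selection(columns, fewshot=False))  # Belebele included
print(default_selection(columns, fewshot=True))   # Belebele dropped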
src/leaderboard/read_evals.py

@@ -107,7 +107,7 @@ class EvalResult:
             if task.benchmark not in data or task.metric not in data[task.benchmark]:
                 continue
             score = data[task.benchmark][task.metric]
-            if task.metric in ("
+            if task.metric in ("acc", "f1", "loglikelihood", "rouge"):
                 score *= 100
             results[task.benchmark + "_" + task.metric][seed] = score

@@ -185,7 +185,7 @@ class EvalResult:
         for task in Tasks:
             result = self.results.get(task.value.benchmark + "_" + task.value.metric)
             data_dict[task.value.col_name] = result
-            if task.value.is_primary_metric:
+            if task.value.is_primary_metric and not (task.value.zero_shot_only and self.n_shot > 0):
                 results_by_task_type[task.value.task_type].append(result)
         results_averages = []
         for task_type, task_type_results in results_by_task_type.items():
@@ -223,7 +223,7 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = defaultdict(lambda: defaultdict(list))

-    for directory_path in Path(results_path).rglob("
+    for directory_path in Path(results_path).rglob("*/*/"):
         for file_path in directory_path.rglob("*-seed/results_*.json"):
             seed = file_path.parent.name.removesuffix("-seed")
             model_result_filepaths[directory_path.relative_to(results_path)][seed].append(file_path)
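Three adjustments here: scores for the fraction-valued metrics ("acc", "f1", "loglikelihood", "rouge") are rescaled to percentages, per-task-type averages skip zero-shot-only tasks whenever a run uses n_shot > 0, and result files are now discovered two directory levels below the results root, matching the new zero-shot/ and few-shot/ subfolders. A small sketch of the averaging guard follows; Task here is a stand-in with only the fields the example needs.

from dataclasses import dataclass

@dataclass
class Task:
    col_name: str
    is_primary_metric: bool = True
    zero_shot_only: bool = False

def run_average(scores: dict, tasks: list, n_shot: int) -> float:
    # Keep a task only if it is a primary metric and is not restricted to
    # zero-shot while the run itself is few-shot (n_shot > 0).
    kept = [scores[t.col_name] for t in tasks
            if t.is_primary_metric and not (t.zero_shot_only and n_shot > 0)]
    return sum(kept) / len(kept)

tasks = [Task("SIB200 (F1)"), Task("Belebele (Accuracy)", zero_shot_only=True)]
scores = {"SIB200 (F1)": 70.0, "Belebele (Accuracy)": 50.0}
print(run_average(scores, tasks, n_shot=0))  # 60.0: Belebele counts in zero-shot
print(run_average(scores, tasks, n_shot=5))  # 70.0: Belebele excluded in few-shot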
src/populate.py

@@ -16,6 +16,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
+    df.dropna(how="all", axis=1, inplace=True)

     return df

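get_leaderboard_df now drops columns that contain no values at all, so each view only shows columns for which at least one model has a score, for instance a zero-shot-only task that never appears in the few-shot results. A tiny pandas demo of that dropna step, with toy data:

import pandas as pd

df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "SIB200 (F1)": [71.3, 64.8],
    "Belebele (Accuracy)": [None, None],  # no scores for this column in this view
})
df.dropna(how="all", axis=1, inplace=True)  # same call as in get_leaderboard_df
print(df.columns.tolist())  # ['Model', 'SIB200 (F1)']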