KurtMica committed
Commit 17ca318 · 1 Parent(s): d088b76

Separate zero-shot & few-shot results.

app.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
@@ -48,7 +49,8 @@ except Exception:
     restart_space()
 
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_0_SHOT_DF = get_leaderboard_df(os.path.join(EVAL_RESULTS_PATH, "zero-shot"), EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_1_SHOT_DF = get_leaderboard_df(os.path.join(EVAL_RESULTS_PATH, "few-shot"), EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
 (
     finished_eval_queue_df,
@@ -56,14 +58,14 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-def init_leaderboard(dataframe):
+def init_leaderboard(dataframe, fewshot=True):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
         select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not (fewshot and c.hidden_in_fewshot)],
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
@@ -104,7 +106,10 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+            with gr.TabItem("Zero-Shot", elem_id="zero-shot"):
+                leaderboard = init_leaderboard(LEADERBOARD_0_SHOT_DF, fewshot=False)
+            with gr.TabItem("Few-Shot", elem_id="few-shot"):
+                leaderboard = init_leaderboard(LEADERBOARD_1_SHOT_DF, fewshot=True)
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
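
How the new fewshot switch changes the default column selection, sketched in isolation (illustrative only: Col and the column list below are stand-ins for the real AutoEvalColumn metadata, not part of the commit):

from dataclasses import dataclass

@dataclass
class Col:  # stand-in for the real column metadata
    name: str
    displayed_by_default: bool = True
    hidden_in_fewshot: bool = False

cols = [
    Col("Model"),
    Col("Average (All) ⬆️"),
    Col("Belebele (Accuracy)", hidden_in_fewshot=True),  # zero-shot-only task
]

def default_selection(fewshot: bool) -> list[str]:
    # Same condition as in init_leaderboard above.
    return [c.name for c in cols if c.displayed_by_default and not (fewshot and c.hidden_in_fewshot)]

print(default_selection(fewshot=False))  # ['Model', 'Average (All) ⬆️', 'Belebele (Accuracy)']
print(default_selection(fewshot=True))   # ['Model', 'Average (All) ⬆️']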
src/about.py CHANGED
@@ -23,18 +23,19 @@ class Task:
     url: str
     task_type: TaskType
     is_primary_metric: bool = True
+    zero_shot_only: bool = False
 
 
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("sentiment_mlt", "f1", "Sentiment Analysis (F1)", "https://github.com/jerbarnes/typology_of_crosslingual/tree/master/data/sentiment/mt", TaskType.NLU)
     task1 = Task("sib200_mlt", "f1", "SIB200 (F1)", "https://huggingface.co/datasets/Davlan/sib200/viewer/mlt_Latn", TaskType.NLU)
     task2 = Task("taxi1500_mlt", "f1", "Taxi1500 (F1)", "https://github.com/cisnlp/Taxi1500", TaskType.NLU)
     task3 = Task("maltese_news_categories", "loglikelihood", "Maltese News Categories (F1)", "https://huggingface.co/datasets/MLRS/maltese_news_categories", TaskType.NLU)
     task4 = Task("multieurlex_mlt", "loglikelihood", "MultiEURLEX (F1)", "https://huggingface.co/datasets/nlpaueb/multi_eurlex", TaskType.NLU)
-    task5 = Task("belebele_mlt", "acc", "Belebele (Accuracy)", "https://huggingface.co/datasets/facebook/belebele/viewer/mlt_Latn", TaskType.NLU)
+    task5 = Task("belebele_mlt", "acc", "Belebele (Accuracy)", "https://huggingface.co/datasets/facebook/belebele/viewer/mlt_Latn", TaskType.NLU, zero_shot_only=True)
     task6 = Task("opus100_eng-mlt", "bleu", "OPUS-100 EN→MT (BLEU)", "https://huggingface.co/datasets/MLRS/OPUS-MT-EN-Fixed", TaskType.NLG, False)
     task7 = Task("opus100_eng-mlt", "chrf", "OPUS-100 EN→MT (ChrF)", "https://huggingface.co/datasets/MLRS/OPUS-MT-EN-Fixed", TaskType.NLG)
     task8 = Task("flores200_eng-mlt", "bleu", "Flores-200 EN→MT (BLEU)", "https://huggingface.co/datasets/Muennighoff/flores200", TaskType.NLG, False)
src/display/utils.py CHANGED
@@ -20,20 +20,21 @@ class ColumnContent:
     displayed_by_default: bool
     hidden: bool = False
     never_hidden: bool = False
+    hidden_in_fewshot: bool = False
 
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("N-Shot", "number", False)])
-auto_eval_column_dict.append(["prompt_version", ColumnContent, ColumnContent("Version", "str", False)])
+auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("N-Shot", "number", True)])
+auto_eval_column_dict.append(["prompt_version", ColumnContent, ColumnContent("Version", "str", True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average (All) ⬆️", "number", True)])
 for task_type in TaskType:
     auto_eval_column_dict.append([task_type.value.name, ColumnContent, ColumnContent(f"Average ({task_type.value.display_name}) {task_type.value.symbol}", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", task.value.is_primary_metric)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", task.value.is_primary_metric, hidden_in_fewshot=task.value.zero_shot_only)])
 # Model information
 auto_eval_column_dict.append(["model_training", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["maltese_training", ColumnContent, ColumnContent("Maltese Training", "str", False)])
src/leaderboard/read_evals.py CHANGED
@@ -107,7 +107,7 @@ class EvalResult:
             if task.benchmark not in data or task.metric not in data[task.benchmark]:
                 continue
             score = data[task.benchmark][task.metric]
-            if task.metric in ("accuracy", "f1", "loglikelihood", "rouge"):
+            if task.metric in ("acc", "f1", "loglikelihood", "rouge"):
                 score *= 100
             results[task.benchmark + "_" + task.metric][seed] = score
 
@@ -185,7 +185,7 @@ class EvalResult:
         for task in Tasks:
             result = self.results.get(task.value.benchmark + "_" + task.value.metric)
             data_dict[task.value.col_name] = result
-            if task.value.is_primary_metric:
+            if task.value.is_primary_metric and not (task.value.zero_shot_only and self.n_shot > 0):
                 results_by_task_type[task.value.task_type].append(result)
         results_averages = []
         for task_type, task_type_results in results_by_task_type.items():
@@ -223,7 +223,7 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = defaultdict(lambda: defaultdict(list))
 
-    for directory_path in Path(results_path).rglob("*-shot/*/*/"):
+    for directory_path in Path(results_path).rglob("*/*/"):
        for file_path in directory_path.rglob("*-seed/results_*.json"):
             seed = file_path.parent.name.removesuffix("-seed")
             model_result_filepaths[directory_path.relative_to(results_path)][seed].append(file_path)
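
The relaxed glob reflects that each leaderboard is now built from its own per-shot subtree of EVAL_RESULTS_PATH, so the shot level is no longer part of the pattern. A hedged sketch of the layout the traversal expects (the intermediate levels, shown here as org/model, are inferred from the */*/ pattern rather than stated in the commit):

# eval-results/               <- EVAL_RESULTS_PATH (name illustrative)
# ├── zero-shot/
# │   └── <org>/<model>/      <- matched by rglob("*/*/")
# │       └── <n>-seed/results_*.json
# └── few-shot/
#     └── <org>/<model>/
#         └── <n>-seed/results_*.json
from pathlib import Path

results_path = "eval-results/few-shot"  # hypothetical local path
for directory_path in Path(results_path).rglob("*/*/"):
    for file_path in directory_path.rglob("*-seed/results_*.json"):
        seed = file_path.parent.name.removesuffix("-seed")
        print(directory_path.relative_to(results_path), seed, file_path.name)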
src/populate.py CHANGED
@@ -16,6 +16,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
+    df.dropna(how="all", axis=1, inplace=True)
 
     return df
 
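
The added dropna removes columns that have no values at all, which is what a zero-shot-only task column looks like in the few-shot table. A minimal illustration with made-up numbers:

import pandas as pd

df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "SIB200 (F1)": [61.2, 58.9],          # illustrative scores
    "Belebele (Accuracy)": [None, None],  # never populated in the few-shot run
})
df.dropna(how="all", axis=1, inplace=True)
print(list(df.columns))  # ['Model', 'SIB200 (F1)']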