jasonshaoshun committed · Commit 06e8556 · Parent: 9ebccf5
Commit message: debug

Files changed:
- app.py +32 -16
- src/about.py +1 -1
- src/display/utils.py +4 -4
- src/leaderboard/read_evals.py +61 -83
- src/populate.py +6 -34
app.py
CHANGED
@@ -30,7 +30,7 @@ from src.display.utils import (
     fields,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
-from src.populate import get_evaluation_queue_df, get_leaderboard_df,
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
 from src.submission.submit import add_new_eval
 
 
@@ -49,15 +49,6 @@ try:
 except Exception:
     restart_space()
 
-# print("EVAL_RESULTS_PATH")
-# try:
-#     print(EVAL_RESULTS_PATH)
-#     snapshot_download(
-#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-#     )
-# except Exception:
-#     restart_space()
-
 
 try:
     print(RESULTS_REPO_MIB_SUBGRAPH)
@@ -78,8 +69,8 @@ except Exception:
 
 
 
-LEADERBOARD_DF_MIB_SUBGRAPH =
-
+LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
+LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB, BENCHMARK_COLS_MIB)
 
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
@@ -91,7 +82,32 @@ LEADERBOARD_DF_MIB_SUBGRAPH = get_leaderboard_df_mib(EVAL_RESULTS_MIB_SUBGRAPH_P
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-def
+def init_leaderboard_mib_subgraph(dataframe, track):
+    print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
+
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    # filter for correct track
+    # dataframe = dataframe.loc[dataframe["Track"] == track]
+
+    print(f"init_leaderboard_mib: dataframe head after loc is {dataframe.head()}\n")
+
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn_mib)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn_mib) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn_mib) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=["Method"],  # Changed from AutoEvalColumn_mib.model.name to "Method"
+        hide_columns=[c.name for c in fields(AutoEvalColumn_mib) if c.hidden],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+def init_leaderboard_mib_causalgraph(dataframe, track):
     print(f"init_leaderboard_mib: dataframe head before loc is {dataframe.head()}\n")
 
     if dataframe is None or dataframe.empty:
@@ -116,6 +132,7 @@ def init_leaderboard_mib(dataframe, track):
         interactive=False,
     )
 
+
 def init_leaderboard(dataframe, track):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -180,11 +197,10 @@ with demo:
             # gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
             with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
-                leaderboard =
-                # leaderboard = init_leaderboard_mib(LEADERBOARD_DF, "mib")
+                leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
 
             # with gr.TabItem("Causal Graph", elem_id="causalgraph", id=1):
-            #     leaderboard =
+            #     leaderboard = init_leaderboard_mib_causalgraph(LEADERBOARD_DF_MIB_CAUSALGRAPH, "Causal Graph")
 
             # with gr.Row():
             #     with gr.Accordion("📙 Citation", open=False):
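Note on the wiring above: the Subgraph tab now hands a pre-built DataFrame to a Leaderboard component via init_leaderboard_mib_subgraph. Below is a minimal, self-contained sketch of that pattern, assuming the Space uses the gradio_leaderboard package (whose Leaderboard/SelectColumns names and keyword arguments match the calls in this file); the toy DataFrame, method names, and column names are made up for illustration only, not the Space's real schema.

# Minimal sketch (assumptions: gradio_leaderboard provides Leaderboard/SelectColumns
# with the kwargs used in app.py; the data below is invented for illustration).
import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns

toy_df = pd.DataFrame(
    {
        "Method": ["attribution-patching", "edge-pruning"],  # hypothetical methods
        "ioi_gpt2": [71.2, 68.9],                            # hypothetical scores
        "mcqa_qwen": [63.5, 60.1],
        "Average": [67.4, 64.5],
    }
)

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
            Leaderboard(
                value=toy_df,
                datatype=["markdown", "number", "number", "number"],
                select_columns=SelectColumns(
                    default_selection=list(toy_df.columns),
                    cant_deselect=["Method"],
                    label="Select Columns to Display:",
                ),
                search_columns=["Method"],
                interactive=False,
            )

if __name__ == "__main__":
    demo.launch()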
src/about.py
CHANGED
@@ -27,7 +27,7 @@ class Tasks(Enum):
     task3 = Task("ewok", "acc", "EWoK")
 
 
-class
+class TasksMib_Subgraph(Enum):
     task0 = TaskMIB("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
     task1 = TaskMIB("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])
 
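The new TasksMib_Subgraph enum drives every per-task, per-model leaderboard column built in src/display/utils.py. A rough sketch of how the pieces fit together follows; the TaskMIB field layout is an assumption inferred from how task.value.benchmark, task.value.models, and task.value.col_name are used in that file, since src/about.py is only partially visible in this diff.

from collections import namedtuple
from enum import Enum

# Assumed field layout for TaskMIB (not shown in this diff).
TaskMIB = namedtuple("TaskMIB", ["benchmark", "models", "col_name", "metrics"])

class TasksMib_Subgraph(Enum):
    task0 = TaskMIB("ioi", ["meta_llama", "qwen", "gpt2"], "ioi", ["edge_counts", "faithfulness"])
    task1 = TaskMIB("mcqa", ["meta_llama", "qwen", "gpt2"], "mcqa", ["edge_counts", "faithfulness"])

# One leaderboard column per (task, model) pair, mirroring the loop in src/display/utils.py:
for task in TasksMib_Subgraph:
    for model in task.value.models:
        print(f"{task.value.benchmark}_{model}")  # ioi_meta_llama, ioi_qwen, ..., mcqa_gpt2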
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks, TasksMultimodal,
+from src.about import Tasks, TasksMultimodal, TasksMib_Subgraph
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -35,7 +35,7 @@ auto_eval_column_dict_mib = []
 auto_eval_column_dict_mib.append(["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)])
 
 # For each task and model combination
-for task in
+for task in TasksMib_Subgraph:
     for model in task.value.models:
         col_name = f"{task.value.benchmark}_{model}"  # ioi_meta_llama, mcqa_qwen, etc.
         auto_eval_column_dict_mib.append([
@@ -54,9 +54,9 @@ AutoEvalColumn_mib = make_dataclass("AutoEvalColumn_mib", auto_eval_column_dict_
 # Column selection for display
 COLS_MIB = [c.name for c in fields(AutoEvalColumn_mib) if not c.hidden]
 
-# BENCHMARK_COLS_MIB = [t.value.col_name for t in
+# BENCHMARK_COLS_MIB = [t.value.col_name for t in TasksMib_Subgraph]
 BENCHMARK_COLS_MIB = []
-for task in
+for task in TasksMib_Subgraph:
     for model in task.value.models:
         col_name = f"{task.value.col_name}_{model.replace('-', '_')}"
         BENCHMARK_COLS_MIB.append(col_name)
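AutoEvalColumn_mib is assembled at import time with dataclasses.make_dataclass from the (attribute name, annotation, default ColumnContent) triples appended above. A small self-contained sketch of that pattern is below; the ColumnContent dataclass here is an assumption modeled on the usual leaderboard template, including only the attributes this file actually reads (name, type, displayed_by_default, hidden, never_hidden).

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    # Assumed shape, matching the attributes read elsewhere in utils.py.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Same helper as above: pick up the ColumnContent defaults stored on the class.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

cols = [
    ["method", ColumnContent, ColumnContent("Method", "markdown", True, never_hidden=True)],
    ["ioi_gpt2", ColumnContent, ColumnContent("ioi_gpt2", "number", True)],
]
# Each triple is (attribute name, annotation, default), which make_dataclass accepts.
AutoEvalColumn_mib = make_dataclass("AutoEvalColumn_mib", cols, frozen=True)

print([c.name for c in fields(AutoEvalColumn_mib) if not c.hidden])  # ['Method', 'ioi_gpt2']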
src/leaderboard/read_evals.py
CHANGED
@@ -13,29 +13,9 @@ from src.submission.check_validity import is_model_on_hub
 
 
 from typing import List, Dict
-
-
-
-# def compute_area(edge_counts, faithfulnesses, log_scale=True):
-#     percentages = [e / max(edge_counts) for e in edge_counts]
-#     area_under = 0.
-#     area_from_100 = 0.
-#     for i in range(len(faithfulnesses) - 1):
-#         i_1, i_2 = i, i+1
-#         x_1 = percentages[i_1]
-#         x_2 = percentages[i_2]
-#         # area from point to 100
-#         if log_scale:
-#             x_1 = math.log(x_1)
-#             x_2 = math.log(x_2)
-#         trapezoidal = (percentages[i_2] - percentages[i_1]) * \
-#             (((abs(1. - faithfulnesses[i_1])) + (abs(1. - faithfulnesses[i_2]))) / 2)
-#         area_from_100 += trapezoidal
-
-#         trapezoidal = (percentages[i_2] - percentages[i_1]) * ((faithfulnesses[i_1] + faithfulnesses[i_2]) / 2)
-#         area_under += trapezoidal
-#     average = sum(faithfulnesses) / len(faithfulnesses)
-#     return (area_under, area_from_100, average)
+
+
+
 def compute_area(edge_counts, faithfulnesses, log_scale=True):
     # Return None if either list is empty
     if not edge_counts or not faithfulnesses:
@@ -62,7 +42,7 @@ def compute_area(edge_counts, faithfulnesses, log_scale=True):
     return (area_under, area_from_100, average)
 
 @dataclass
-class
+class EvalResult_MIB_SUBGRAPH:
     """Represents one full evaluation for a method across all models in MIB."""
     eval_name: str      # method name as identifier
     method_name: str    # name of the interpretation method
@@ -104,63 +84,13 @@ class EvalResult_MIB:
                 "faithfulness": scores[task]["faithfulness"]
             }
 
-        return
+        return EvalResult_MIB_SUBGRAPH(
             eval_name=method_name,
             method_name=method_name,
             results=results
         )
 
 
-
-    # def to_dict(self):
-    #     """Converts the Eval Result to a dict for dataframe display"""
-    #     data_dict = {
-    #         "eval_name": self.eval_name,
-    #         "Method": self.method_name,
-    #     }
-
-    #     all_scores = []
-    #     required_entries = {
-    #         'ioi_meta_llama': False,
-    #         'ioi_qwen': False,
-    #         'ioi_gpt2': False,
-    #         'mcqa_meta_llama': False,
-    #         'mcqa_qwen': False,
-    #         'mcqa_gpt2': False
-    #     }
-
-    #     # For each task (ioi, mcqa)
-    #     for task, task_results in self.results.items():
-    #         # Get the models that have results for this task
-    #         models = task_results.keys()
-
-    #         for model in models:
-    #             col_name = f"{task}_{model}"
-    #             metrics = task_results[model]
-    #             if metrics:
-    #                 edge_counts = metrics["edge_counts"]
-    #                 faithfulness = metrics["faithfulness"]
-    #                 if isinstance(faithfulness[0], list):
-    #                     faithfulness = faithfulness[0]
-
-    #                 # Use compute_area
-    #                 area_under, area_from_100, avg = compute_area(edge_counts, faithfulness)
-    #                 score = area_under * 100
-    #                 data_dict[col_name] = round(score, 2)
-    #                 all_scores.append(score)
-    #                 required_entries[col_name] = True
-    #             else:
-    #                 data_dict[col_name] = '-'
-
-    #     # Only show average if all six required entries are present
-    #     if all(required_entries.values()):
-    #         data_dict["Average"] = round(np.mean(all_scores), 2)
-    #     else:
-    #         data_dict["Average"] = '-'
-
-    #     return data_dict
-
-
 
     def to_dict(self):
         """Converts the Eval Result to a dict for dataframe display"""
@@ -211,13 +141,7 @@ class EvalResult_MIB:
         return data_dict
 
 
-
-
-
-
-
-
-def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[EvalResult_MIB]:
+def get_raw_eval_results_mib_subgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_SUBGRAPH]:
     """From the path of the results folder root, extract all needed info for MIB results"""
     model_result_filepaths = []
 
@@ -243,7 +167,7 @@ def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[Eval
     eval_results = []
     for model_result_filepath in model_result_filepaths:
         try:
-            eval_result =
+            eval_result = EvalResult_MIB_SUBGRAPH("", "", {})  # Create empty instance
             result = eval_result.init_from_json_file(model_result_filepath)
             print(f"eval_result.init_from_json_file(model_result_filepath) is {result}")
             # Verify the result can be converted to dict format
@@ -264,6 +188,60 @@ def get_raw_eval_results_mib(results_path: str, requests_path: str) -> List[Eval
 
 
 
+
+
+
+
+
+
+
+
+
+
+@dataclass
+class EvalResult_MIB_CAUSALGRAPH:
+    """Represents one full evaluation for a method across all models in MIB."""
+    eval_name: str      # method name as identifier
+    method_name: str    # name of the interpretation method
+    results: Dict       # nested dict of results {task: {model: {metric: scores}}}
+
+
+    def init_from_json_file(self, json_filepath):
+
+
+
+
+    def to_dict(self):
+
+        return data_dict
+
+
+def get_raw_eval_results_mib_causalgraph(results_path: str, requests_path: str) -> List[EvalResult_MIB_CAUSALGRAPH]:
+    """From the path of the results folder root, extract all needed info for MIB results"""
+    model_result_filepaths = []
+
+    print(f"results_path is {results_path}")
+
+
+    return eval_results
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
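The commented-out block deleted at the top of this file is the clearest statement of what compute_area does: normalize the edge counts to fractions of the largest circuit, then trapezoidally integrate the faithfulness curve over those fractions (and its distance from perfect faithfulness of 1.0), plus the plain mean of the faithfulness values. The retained compute_area adds an empty-input guard and, as far as this diff shows, keeps the same integration. A cleaned-up sketch of that computation, based on the removed code rather than on the mostly hidden new body:

import math
from typing import List, Optional, Tuple

def compute_area(
    edge_counts: List[float],
    faithfulnesses: List[float],
    log_scale: bool = True,
) -> Optional[Tuple[float, float, float]]:
    """Trapezoidal area under (and above) a faithfulness-vs-circuit-size curve."""
    if not edge_counts or not faithfulnesses:
        return None

    # Fraction of the full graph kept at each point on the curve.
    percentages = [e / max(edge_counts) for e in edge_counts]

    area_under = 0.0     # area under the faithfulness curve
    area_from_100 = 0.0  # area between the curve and perfect faithfulness (1.0)
    for i in range(len(faithfulnesses) - 1):
        width = percentages[i + 1] - percentages[i]
        # Note: the removed code also computed math.log of the x values when
        # log_scale was True but never used them in the trapezoid widths, so
        # that step is omitted here; log_scale is kept only for signature parity.
        area_from_100 += width * ((abs(1.0 - faithfulnesses[i]) + abs(1.0 - faithfulnesses[i + 1])) / 2)
        area_under += width * ((faithfulnesses[i] + faithfulnesses[i + 1]) / 2)

    average = sum(faithfulnesses) / len(faithfulnesses)
    return area_under, area_from_100, average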
src/populate.py
CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results,
+from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
@@ -42,39 +42,10 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
 
 
-
-# """Creates a dataframe from all the individual experiment results"""
-# print(f"results_path is {results_path}, requests_path is {requests_path}")
-# raw_data = get_raw_eval_results(results_path, requests_path)
-# print(f"raw_data is {raw_data}")
-# all_data_json = [v.to_dict() for v in raw_data]
-# print(f"all_data_json is {pd.DataFrame.from_records(all_data_json)}")
-# all_data_json_filtered = []
-# for item in all_data_json:
-#     item["Track"] = item["eval_name"].split("_")[-1]
-#     if "VQA" in benchmark_cols and "VQA" in item:
-#         all_data_json_filtered.append(item)
-#     if "VQA" not in benchmark_cols and "VQA" not in item:
-#         all_data_json_filtered.append(item)
-#     all_data_json_filtered.append(item)
-
-# all_data_json = all_data_json_filtered
-
-# df = pd.DataFrame.from_records(all_data_json)
-# df = df.sort_values(by=[AutoEvalColumn.text_average.name], ascending=False)
-
-# print(f"df is {df}")
-
-# df = df[cols].round(decimals=1)
-
-# # filter out if any of the benchmarks have not been produced
-# df = df[has_no_nan_values(df, benchmark_cols)]
-# return df
-
-def get_leaderboard_df_mib(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df_mib_subgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
     print(f"results_path is {results_path}, requests_path is {requests_path}")
-    raw_data =
+    raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)
     print(f"raw_data is {raw_data}")
 
     # Convert each result to dict format
@@ -94,10 +65,11 @@ def get_leaderboard_df_mib(results_path: str, requests_path: str, cols: list, be
 
     return df
 
-def
+def get_leaderboard_df_mib_causalgraph(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
     print(f"results_path is {results_path}, requests_path is {requests_path}")
-    raw_data =
+    raw_data = get_raw_eval_results_mib_causalgraph(results_path, requests_path)
+    # Implement the rest of the code
    return raw_data
 
 
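get_leaderboard_df_mib_subgraph follows the same shape as the other get_leaderboard_df_* helpers: collect raw EvalResult_MIB_SUBGRAPH objects, convert each to a dict, and assemble a DataFrame. Because the middle of the function is collapsed as unchanged context in this diff, the sketch below is a hedged reconstruction; the "Average" sort and the handling of '-' placeholders are assumptions, not lines visible in the hunk.

import pandas as pd

from src.leaderboard.read_evals import get_raw_eval_results_mib_subgraph

def get_leaderboard_df_mib_subgraph_sketch(results_path: str, requests_path: str,
                                           cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Sketch of the subgraph leaderboard assembly; the real function lives in src/populate.py."""
    # One EvalResult_MIB_SUBGRAPH per result file found under results_path.
    raw_data = get_raw_eval_results_mib_subgraph(results_path, requests_path)

    # Convert each result to dict format and load everything into a DataFrame.
    all_data_json = [v.to_dict() for v in raw_data]
    df = pd.DataFrame.from_records(all_data_json)

    # Assumption: rank methods by their aggregate score; '-' placeholders
    # (methods missing some results) sort to the bottom.
    if "Average" in df.columns:
        df["_avg"] = pd.to_numeric(df["Average"], errors="coerce")
        df = df.sort_values(by="_avg", ascending=False).drop(columns="_avg")

    return df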