lixuejing committed
Commit · 6500fc4
1 Parent(s): 33927d7
update

Files changed:
- app.py +17 -11
- src/about.py +2 -0
- src/display/utils.py +29 -1
app.py
CHANGED
@@ -24,7 +24,11 @@ from src.display.utils import (
     fields,
     WeightType,
     Precision,
-    NUMERIC_INTERVALS
+    NUMERIC_INTERVALS,
+    QUOTACOLS,
+    QUOTATYPES,
+    AutoEvalColumnQuota,
+    BENCHMARK_QUOTACOLS
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -32,10 +36,10 @@ from src.submission.submit import add_new_eval
 from src.scripts.update_all_request_files import update_dynamic_files
 from src.tools.collections import update_collections
 from src.tools.datastatics import get_statics
-from src.tools.plots import (
-    create_plot_df,
-    create_scores_df,
-)
+#from src.tools.plots import (
+#    create_plot_df,
+#    create_scores_df,
+#)
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=TOKEN)
@@ -60,17 +64,18 @@ def init_space():
         restart_space()
 
     raw_data, original_df = get_leaderboard_df(
-    #leaderboard_df = get_leaderboard_df(
         results_path=EVAL_RESULTS_PATH,
         requests_path=EVAL_REQUESTS_PATH,
         dynamic_path=DYNAMIC_INFO_FILE_PATH,
-        cols=COLS,
-        benchmark_cols=BENCHMARK_COLS
+        #cols=COLS,
+        #benchmark_cols=BENCHMARK_COLS,
+        cols=QUOTACOLS,
+        benchmark_cols=BENCHMARK_QUOTACOLS
     )
     update_collections(original_df.copy())
     leaderboard_df = original_df.copy()
 
-    plot_df = create_plot_df(create_scores_df(raw_data))
+    #plot_df = create_plot_df(create_scores_df(raw_data))
 
     (
         finished_eval_queue_df,
@@ -78,9 +83,10 @@ def init_space():
         pending_eval_queue_df,
     ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-    return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+    #return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
+    return leaderboard_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 
-leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
+leaderboard_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
 #return leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 
 #leaderboard_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
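Note: get_leaderboard_df lives in src/populate.py and its body is not part of this diff, so the effect of swapping cols=COLS / benchmark_cols=BENCHMARK_COLS for the new quota lists is only sketched here. The stand-in helper below (select_leaderboard_columns is a hypothetical name) assumes the usual leaderboard-template behaviour: subset the results table to the requested columns and keep only rows that have at least one benchmark score.

import pandas as pd

# Hypothetical stand-in for the column selection that get_leaderboard_df is
# assumed to perform; the real helper is in src/populate.py, not in this diff.
def select_leaderboard_columns(df: pd.DataFrame, cols: list, benchmark_cols: list) -> pd.DataFrame:
    present = [c for c in cols if c in df.columns]           # keep requested columns that exist
    df = df[present]
    scored = [c for c in benchmark_cols if c in df.columns]
    return df.dropna(subset=scored, how="all")               # drop rows with no benchmark score at all

raw = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Average ⬆️": [55.0, None],
    "Perception": [60.0, None],
    "SpatialReasoning": [50.0, None],
})
QUOTACOLS = ["Model", "Average ⬆️", "Perception", "SpatialReasoning"]
BENCHMARK_QUOTACOLS = ["Perception", "SpatialReasoning"]
print(select_leaderboard_columns(raw, QUOTACOLS, BENCHMARK_QUOTACOLS))   # only model-a survives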
src/about.py
CHANGED
@@ -22,6 +22,8 @@ class Tasks(Enum):
     SAT = Task("SAT", "overall", "SAT")
     egoplan_bench2 = Task("egoplan_bench2", "overall", "egoplan_bench2")
     erqa = Task("erqa", "overall", "erqa")
+
+class Quotas(Enum):
     Perception = Task("Perception", "overall", "Perception")
     SpatialReasoning = Task("SpatialReasoning", "overall", "SpatialReasoning")
     Prediction = Task("Prediction", "overall", "Prediction")
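Note: Task itself is not touched by this diff; it is assumed below to be the usual three-field dataclass from this leaderboard template (benchmark, metric, col_name). The sketch shows how the new Quotas enum is consumed downstream, e.g. by src/display/utils.py, which iterates it and reads .value.col_name.

from dataclasses import dataclass
from enum import Enum

# Assumed shape of Task (not shown in this diff).
@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str

# Mirror of the Quotas enum added above: the three entries split out of Tasks.
class Quotas(Enum):
    Perception = Task("Perception", "overall", "Perception")
    SpatialReasoning = Task("SpatialReasoning", "overall", "SpatialReasoning")
    Prediction = Task("Prediction", "overall", "Prediction")

# Downstream usage, as in BENCHMARK_QUOTACOLS:
print([t.value.col_name for t in Quotas])   # ['Perception', 'SpatialReasoning', 'Prediction']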
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
+from src.about import Tasks,Quotas
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -44,6 +44,30 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
+## Leaderboard columns
+auto_eval_column_quota_dict = []
+# Init
+auto_eval_column_quota_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_quota_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+auto_eval_column_quota_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+for task in Quotas:
+    auto_eval_column_quota_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+auto_eval_column_quota_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_quota_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_quota_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_quota_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_quota_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_quota_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
+auto_eval_column_quota_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
+# Dummy column for the search bar (hidden by the custom CSS)
+auto_eval_column_quota_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+
+# We use make dataclass to dynamically fill the scores from Tasks
+AutoEvalColumnQuota = make_dataclass("AutoEvalColumnQuota", auto_eval_column_quota_dict, frozen=True)
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
@@ -116,10 +140,14 @@ class Precision(Enum):
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 
+QUOTACOLS = [c.name for c in fields(AutoEvalColumnQuota) if not c.hidden]
+QUOTATYPES = [c.type for c in fields(AutoEvalColumnQuota) if not c.hidden]
+
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_QUOTACOLS = [t.value.col_name for t in Quotas]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
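Note: a minimal, self-contained sketch of the make_dataclass/fields pattern that the new AutoEvalColumnQuota block relies on. ColumnContent here is a stand-in for the descriptor already defined earlier in src/display/utils.py (only the attributes read below are reproduced, and it is declared frozen so its instances stay hashable and are accepted as dataclass field defaults on newer Python versions); the real class is not part of this diff.

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:            # stand-in for the repo's column descriptor
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

def fields(raw_class):
    # Same helper as in src/display/utils.py: collect the class-level
    # ColumnContent defaults, skipping dunder attributes.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

# Build a reduced column set the same way the diff builds AutoEvalColumnQuota.
quota_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["Perception", ColumnContent, ColumnContent("Perception", "number", True)],
    ["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)],
]
AutoEvalColumnQuota = make_dataclass("AutoEvalColumnQuota", quota_dict, frozen=True)

QUOTACOLS = [c.name for c in fields(AutoEvalColumnQuota) if not c.hidden]
QUOTATYPES = [c.type for c in fields(AutoEvalColumnQuota) if not c.hidden]
print(QUOTACOLS)    # ['Model', 'Perception'] -- hidden columns are dropped
print(QUOTATYPES)   # ['markdown', 'number']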