Spaces:
Running
Running
add option to exclude environment from mean
Browse files
app.py
CHANGED
|
@@ -28,6 +28,15 @@ def filter_dfs(tags, lb):
|
|
| 28 |
lb = lb[lb["Tags"].apply(lambda x: any(tag in x for tag in tags))]
|
| 29 |
return lb
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
def restart_space():
|
| 33 |
API.restart_space(repo_id=REPO_ID)
|
|
@@ -129,7 +138,8 @@ agg_df = BenchmarkSuite.aggregate_df(results_df)
|
|
| 129 |
agg_df = agg_df.pivot(index="dataset", columns="benchmark_category", values="score")
|
| 130 |
agg_df.rename(columns={"OVERALL": "General"}, inplace=True)
|
| 131 |
agg_df.columns = [x.capitalize() for x in agg_df.columns]
|
| 132 |
-
agg_df["Mean"
|
|
|
|
| 133 |
# make sure mean is the first column
|
| 134 |
agg_df = agg_df[["Mean"] + [col for col in agg_df.columns if col != "Mean"]]
|
| 135 |
for col in agg_df.columns:
|
|
@@ -212,6 +222,9 @@ app = gr.Blocks(css=custom_css, title="TTS Benchmark Leaderboard")
|
|
| 212 |
with app:
|
| 213 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 214 |
with gr.TabItem("🏅 TTSDB Scores", elem_id="llm-benchmark-tab-table", id=0):
|
|
|
|
|
|
|
|
|
|
| 215 |
tags = gr.Dropdown(
|
| 216 |
TAGS,
|
| 217 |
value=[],
|
|
@@ -221,6 +234,7 @@ with app:
|
|
| 221 |
)
|
| 222 |
leaderboard = init_leaderboard(f_a_df)
|
| 223 |
tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
|
|
|
|
| 224 |
with gr.TabItem("🏅 Individual Benchmarks", elem_id="llm-benchmark-tab-table", id=1):
|
| 225 |
tags = gr.Dropdown(
|
| 226 |
TAGS,
|
|
|
|
| 28 |
lb = lb[lb["Tags"].apply(lambda x: any(tag in x for tag in tags))]
|
| 29 |
return lb
|
| 30 |
|
| 31 |
+
def change_mean(env, lb):
    """Rebuild the leaderboard table with a freshly computed "Mean" column.

    Args:
        env: Truthy to leave the "Environment" column out of the mean,
            falsy to include it.
        lb: Ignored. The result is always rebuilt from a copy of the
            module-level ``f_a_df``, never from the table passed in.
            NOTE(review): this means any changes applied to the displayed
            table (presumably the tag filter) are not carried over -
            confirm this is intended.

    Returns:
        A copy of ``f_a_df`` whose "Mean" column holds the row-wise mean of
        every column except "Mean", "Model", "Tags" and, when ``env`` is
        truthy, "Environment".
    """
    # Columns that never contribute to the mean; "Environment" joins them
    # only when the caller asks for it to be excluded.
    skipped = {"Mean", "Model", "Tags"}
    if env:
        skipped.add("Environment")

    # Work on a copy so the module-level frame is left untouched.
    table = f_a_df.copy()
    scored = [name for name in table.columns if str(name) not in skipped]
    table["Mean"] = table[scored].mean(axis=1)
    return table
|
| 40 |
|
| 41 |
def restart_space():
|
| 42 |
API.restart_space(repo_id=REPO_ID)
|
|
|
|
| 138 |
agg_df = agg_df.pivot(index="dataset", columns="benchmark_category", values="score")
|
| 139 |
agg_df.rename(columns={"OVERALL": "General"}, inplace=True)
|
| 140 |
agg_df.columns = [x.capitalize() for x in agg_df.columns]
|
| 141 |
+
mean_cols = [col for col in agg_df.columns if str(col) not in ["Mean", "Environment", "Model", "Tags"]]
|
| 142 |
+
agg_df["Mean"] = agg_df[mean_cols].mean(axis=1)
|
| 143 |
# make sure mean is the first column
|
| 144 |
agg_df = agg_df[["Mean"] + [col for col in agg_df.columns if col != "Mean"]]
|
| 145 |
for col in agg_df.columns:
|
|
|
|
| 222 |
with app:
|
| 223 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 224 |
with gr.TabItem("🏅 TTSDB Scores", elem_id="llm-benchmark-tab-table", id=0):
|
| 225 |
+
with gr.Group():
|
| 226 |
+
env = gr.Checkbox(value=True, label="Exclude environment from mean.")
|
| 227 |
+
gr.Markdown("**Environment** measures how well the system can reproduce noise in the training data. This doesn't correlate with human judgements for 'naturalness'")
|
| 228 |
tags = gr.Dropdown(
|
| 229 |
TAGS,
|
| 230 |
value=[],
|
|
|
|
| 234 |
)
|
| 235 |
leaderboard = init_leaderboard(f_a_df)
|
| 236 |
tags.change(filter_dfs, [tags, leaderboard], [leaderboard])
|
| 237 |
+
env.change(change_mean, [env, leaderboard], [leaderboard])
|
| 238 |
with gr.TabItem("🏅 Individual Benchmarks", elem_id="llm-benchmark-tab-table", id=1):
|
| 239 |
tags = gr.Dropdown(
|
| 240 |
TAGS,
|