Yotam-Perlitz committed · Commit b5e722a · 1 parent: 5c9c592

improve writings

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>
app.py
CHANGED
@@ -51,16 +51,17 @@ st.divider()
 
 st.markdown(
     """
-    …
-    …
-    …
-    …
-    BenchBench is for you if:
+    BenchBench rates benchmarks according to their agreement with the defined *Aggregate Benchmark* –
+    an enhanced representation of the benchmarks that are out there (see config in sidebar to modify).
+
+
     """
 )
 
 st.markdown(
     """
+    BenchBench is for you if:
+    \n
     - **You have a new benchmark**: Show that it agrees/disagrees with known benchmarks.
     - **You are looking for a benchmark to run/trust**: Find an efficient/private/preferable alternative.
     """

@@ -68,11 +69,10 @@ st.markdown(
 
 st.markdown(
     """
-    …
-    …
-    the benchmarks we compare to, and the models we use to compare with (see sidebar).
+    We also show that agreements are best represented with the BenchBench Score,
+    the relative agreement (Z Score) of each benchmark to the Aggregate benchmark.
     \n
-    …
+    Read more in our work [Benchmark Agreement Testing Done Right](https://arxiv.org/abs/2407.13696) and the [BenchBench repo](https://github.com/IBM/benchbench)
     """
 )
 

@@ -340,7 +340,8 @@ z_scores["date"] = z_scores["source"].apply(
 
 z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
 
-z_score_name = "…
+z_score_name = "BenchBench Score"
+p_val_name = "p val"
 
 data = (
     z_scores.rename(

@@ -348,7 +349,7 @@ data = (
             "scenario": "Benchmark",
             "z_score": z_score_name,
             "corr_with_agg": corr_name,
-            "p_value_of_corr_with_agg": …
+            "p_value_of_corr_with_agg": p_val_name,
             # "n_models_of_corr_with_agg": "# Models Used",
             "source": "Source",
             "date": "Snapshot Date",

@@ -376,12 +377,12 @@ styled_data = (
     )
     .apply(highlight_uploaded_benchmark, axis=1)
     .background_gradient(
-        subset=[…
+        subset=[p_val_name],
         cmap="Reds",
         vmin=0.1,
         vmax=1,
     )
-    .format(subset=[z_score_name, corr_name, …
+    .format(subset=[z_score_name, corr_name, p_val_name], formatter="{:.2}")
     .set_properties(**{"text-align": "center"})
 )
 

@@ -389,7 +390,7 @@ cols_used = [
     "Benchmark",
     z_score_name,
     corr_name,
-    …
+    p_val_name,
     "Snapshot Date",
 ]
 

@@ -399,7 +400,7 @@ st.dataframe(
     column_order=cols_used,
     hide_index=True,
     use_container_width=True,
-    height=…
+    height=300,
     column_config={col: {"alignment": "center"} for col in cols_used},
 )
 

@@ -420,9 +421,41 @@ with st.expander(label="Aggragate Benchmark scores"):
         use_container_width=True,
     )
 
-…
-…
-…
+left, right = st.columns([1, 1])
+
+with left:
+    with st.expander(label="Cite Us!"):
+        st.code(
+            r"""
+
+            @misc{perlitz2024llmbenchmarksagreefixing,
+                title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
+                author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
+                year={2024},
+                eprint={2407.13696},
+                archivePrefix={arXiv},
+                primaryClass={cs.CL},
+                url={https://arxiv.org/abs/2407.13696},
+            }
+
+            """
+        )
+
+with right:
+    with st.expander(label="Cite Everyone Else!"):
+        st.code(
+            r"""
+
+            @misc{perlitz2024llmbenchmarksagreefixing,
+                title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
+                author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
+                year={2024},
+                eprint={2407.13696},
+                archivePrefix={arXiv},
+                primaryClass={cs.CL},
+                url={https://arxiv.org/abs/2407.13696},
+            }
+
 
 @misc{berkeley-function-calling-leaderboard,
     title={Berkeley Function Calling Leaderboard},

@@ -694,7 +727,7 @@ with st.expander(label="Citations"):
     }
 
     """
-    …
+        )
 
 
 st.subheader("Benchmark Report Card")

@@ -714,9 +747,9 @@ plotted_scenario = st.selectbox(
 
 col1, col2, col3 = st.columns(3)
 cur_data = data.query(f"Benchmark=='{plotted_scenario}'")
-col1.metric("Relative agreement", cur_data[…
+col1.metric("Relative agreement", cur_data[z_score_name])
 col2.metric(corr_name, cur_data[corr_name])
-col3.metric("p-value of Corr.", cur_data[…
+col3.metric("p-value of Corr.", cur_data[p_val_name])
 
 cur_df = allbench.df.query(f'scenario=="aggregate" or scenario=="{plotted_scenario}"')
 

@@ -837,3 +870,18 @@ st.image(
     caption="**BenchBench's Standardized Approach Reduces Variance.** This ablation study demonstrates that following the best practices implemented in BenchBench significantly reduces the variance of BAT results, leading to more robust and reliable conclusions.",
     use_column_width=True,
 )
+
+
+st.code(
+    r"""
+    @misc{perlitz2024llmbenchmarksagreefixing,
+        title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
+        author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
+        year={2024},
+        eprint={2407.13696},
+        archivePrefix={arXiv},
+        primaryClass={cs.CL},
+        url={https://arxiv.org/abs/2407.13696},
+    }
+    """
+)