import sys
from pathlib import Path

# Make the local `src` package importable when the app is launched with Streamlit.
sys.path.append(str(Path(__file__).parent))

import pandas as pd
import streamlit as st
from streamlit.elements.lib.column_types import ColumnConfig

from src.strings import (
    CITATION_FEV,
    CITATION_HEADER,
    FEV_BENCHMARK_BASIC_INFO,
    FEV_BENCHMARK_DETAILS,
    PAIRWISE_BENCHMARK_DETAILS,
    get_pivot_legend,
)
from src.utils import (
    COLORS,
    construct_pairwise_chart,
    format_leaderboard,
    format_metric_name,
    get_metric_description,
)

st.set_page_config(layout="wide", page_title="fev leaderboard", page_icon=":material/trophy:")

TITLE = "<h1 style='text-align: center; font-size: 350%;'>fev-bench</h1>"
SORT_COL = "win_rate"
AVAILABLE_METRICS = ["SQL", "MASE", "WQL", "WAPE"]


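# Pre-computed result tables are stored as CSVs under `tables/`; `st.cache_data` caches
# each DataFrame per metric so the files are not re-read on every rerun.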
@st.cache_data()
def get_leaderboard(metric_name: str) -> pd.DataFrame:
    return pd.read_csv(f"tables/leaderboard_{metric_name}.csv")


@st.cache_data()
def get_pairwise(metric_name: str) -> pd.DataFrame:
    return pd.read_csv(f"tables/pairwise_{metric_name}.csv")


@st.cache_data()
def get_pivot_table(metric_name: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    pivot_df = pd.read_csv(f"tables/pivot_{metric_name}.csv")
    baseline_imputed = pd.read_csv(f"tables/pivot_{metric_name}_baseline_imputed.csv")
    leakage_imputed = pd.read_csv(f"tables/pivot_{metric_name}_leakage_imputed.csv")
    return pivot_df, baseline_imputed, leakage_imputed


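# Sidebar: the metric selected here drives every table and chart on the page.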
with st.sidebar:
    selected_metric = st.selectbox("Evaluation Metric", options=AVAILABLE_METRICS, format_func=format_metric_name)
    st.caption(get_metric_description(selected_metric))

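# Three-column layout: narrow gutters on either side, all content in the wide center column.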
cols = st.columns(spec=[0.025, 0.95, 0.025])

with cols[1]:
    st.markdown(TITLE, unsafe_allow_html=True)

    metric_df = get_leaderboard(selected_metric).sort_values(by=SORT_COL, ascending=False)
    pairwise_df = get_pairwise(selected_metric)

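    # Leaderboard table: human-readable column labels and numeric formatting via column_config.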
    st.markdown("## :material/trophy: Leaderboard", unsafe_allow_html=True)
    st.markdown(FEV_BENCHMARK_BASIC_INFO, unsafe_allow_html=True)
    df_styled = format_leaderboard(metric_df)
    st.dataframe(
        df_styled,
        width="stretch",
        hide_index=True,
        column_config={
            "model_name": ColumnConfig(label="Model Name", alignment="left"),
            "win_rate": st.column_config.NumberColumn(label="Avg. win rate (%)", format="%.1f"),
            "skill_score": st.column_config.NumberColumn(label="Skill score (%)", format="%.1f"),
            "median_inference_time_s": st.column_config.NumberColumn(label="Median runtime (s)", format="%.1f"),
            "training_corpus_overlap": st.column_config.NumberColumn(label="Leakage (%)", format="%d"),
            "num_failures": st.column_config.NumberColumn(label="Failed tasks (%)", format="%.0f"),
            "zero_shot": ColumnConfig(label="Zero-shot", alignment="center"),
            "org": ColumnConfig(label="Organization", alignment="left"),
            "link": st.column_config.LinkColumn(label="Link", display_text="π"),
        },
    )

    with st.expander("See details"):
        st.markdown(FEV_BENCHMARK_DETAILS, unsafe_allow_html=True)

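    # Pairwise comparison: win-rate chart on the left, skill-score chart on the right.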
    st.markdown("## :material/bar_chart: Pairwise comparison", unsafe_allow_html=True)
    chart_col_1, _, chart_col_2 = st.columns(spec=[0.45, 0.1, 0.45])

    with chart_col_1:
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="win_rate", metric_name=selected_metric),
            use_container_width=True,
        )

    with chart_col_2:
        st.altair_chart(
            construct_pairwise_chart(pairwise_df, col="skill_score", metric_name=selected_metric),
            use_container_width=True,
        )

    with st.expander("See details"):
        st.markdown(PAIRWISE_BENCHMARK_DETAILS, unsafe_allow_html=True)

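    # Per-task results: one row per task, one column per model, with styling applied below.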
    st.markdown("## :material/table_chart: Results for individual tasks", unsafe_allow_html=True)
    with st.expander("Show detailed results"):
        st.markdown(get_pivot_legend("Seasonal Naive", "Chronos-Bolt"), unsafe_allow_html=True)
        pivot_df, baseline_imputed, leakage_imputed = get_pivot_table(selected_metric)
        pivot_df = pivot_df.set_index("Task name")
        baseline_imputed = baseline_imputed.set_index("Task name")
        leakage_imputed = leakage_imputed.set_index("Task name")

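        # Highlight the three lowest-error models per task and recolor imputed values
        # (baseline-imputed failures and leakage-affected results).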
        def style_pivot_table(errors, is_baseline_imputed, is_leakage_imputed):
            rank_colors = {1: COLORS["gold"], 2: COLORS["silver"], 3: COLORS["bronze"]}

            def highlight_by_position(styler):
                for row_idx in errors.index:
                    # Rank models within each task (row); ties receive the lowest shared rank.
                    row_ranks = errors.loc[row_idx].rank(method="min")
                    for col_idx in errors.columns:
                        rank = row_ranks[col_idx]
                        style_parts = []
                        if rank <= 3:
                            style_parts.append(f"background-color: {rank_colors[rank]}")
                        if is_leakage_imputed.loc[row_idx, col_idx]:
                            style_parts.append(f"color: {COLORS['leakage_impute']}")
                        elif is_baseline_imputed.loc[row_idx, col_idx]:
                            style_parts.append(f"color: {COLORS['failure_impute']}")
                        elif not style_parts:
                            style_parts.append(f"color: {COLORS['text_default']}")
                        if style_parts:
                            # Bind the style string via a default argument so each cell keeps
                            # its own value rather than the loop's final one.
                            styler = styler.map(
                                lambda x, s="; ".join(style_parts): s,
                                subset=pd.IndexSlice[row_idx:row_idx, col_idx:col_idx],
                            )
                return styler

            return highlight_by_position(errors.style).format(precision=3)

        st.dataframe(style_pivot_table(pivot_df, baseline_imputed, leakage_imputed))

    st.divider()
    st.markdown("### :material/format_quote: Citation", unsafe_allow_html=True)
    st.markdown(CITATION_HEADER)
    st.markdown(CITATION_FEV)