Spaces:

dirkraft
/

fuhgedaboudit

Paused

App Files Files Community

Amber Tanaka committed on Aug 14

Commit

1e64d2b

unverified ·

1 Parent(s): b36c3c5

Table Legends Refactor (#57)

Browse files

Files changed (4) hide show

README.md +5 -2
app.py +1 -2
content.py +82 -0
ui_components.py +201 -59

README.md CHANGED Viewed

@@ -18,8 +18,11 @@ tags:
 The leaderboard is built using the [HuggingFace Datasets](https://huggingface.co/docs/datasets/index) library, which provides a convenient way to manage and query datasets.
 It's currently pointed at the [AstaBench Leaderboard](https://huggingface.co/datasets/allenai/asta-bench-internal-results/) dataset, which is a public dataset hosted on HuggingFace.
-To run the leaderboard locally, you can use the following command:
 ```bash
 python app.py
 ```

 The leaderboard is built using the [HuggingFace Datasets](https://huggingface.co/docs/datasets/index) library, which provides a convenient way to manage and query datasets.
 It's currently pointed at the [AstaBench Leaderboard](https://huggingface.co/datasets/allenai/asta-bench-internal-results/) dataset, which is a public dataset hosted on HuggingFace.
+To run the leaderboard locally, first set this environment variable:
+```bash
+export IS_INTERNAL=true
+```
+You can then start it up with the following command:
 ```bash
 python app.py
 ```

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 # app.py
 import gradio as gr
-import os
 import urllib.parse
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -165,7 +164,7 @@ def restart_space_job():
 if __name__ == "__main__":
     if LOCAL_DEBUG:
         print("Launching in LOCAL_DEBUG mode.")
-        def get_initial_global_tag_choices(): return ["Overall", "TagA"]
         demo.launch(debug=True, allowed_paths=["assets"])
     else:
         print("Launching in Space mode.")

 # app.py
 import gradio as gr
 import urllib.parse
 from apscheduler.schedulers.background import BackgroundScheduler
 if __name__ == "__main__":
     if LOCAL_DEBUG:
         print("Launching in LOCAL_DEBUG mode.")
+        def get_initial_global_tag_choices(): return ["Overall"]
         demo.launch(debug=True, allowed_paths=["assets"])
     else:
         print("Launching in Space mode.")

content.py CHANGED Viewed

@@ -445,6 +445,88 @@ span.wrap[tabindex="0"][role="button"][data-editable="false"] {
 #main-header h2 {
     color: #f0529c;
 }
 #about-page-content-wrapper {
   margin-left: auto;
   margin-right: auto;

 #main-header h2 {
     color: #f0529c;
 }
+/* --- New HTML-Based Tooltip Styles --- */
+.tooltip-icon-legend {
+    position: relative;
+    cursor: help;
+    display: inline-block;
+}
+/* The HTML pop-up card.*/
+.tooltip-card {
+    /* Hiding mechanism */
+    opacity: 0;
+    visibility: hidden;
+    transition: opacity 0.2s;
+    pointer-events: none;
+    /* Card appearance */
+    position: absolute;
+    bottom: 125%;
+    left: 50%;
+    transform: translateX(-50%);
+    z-index: 1000;
+    background-color: #083c40;
+    color: #e5e7eb;
+    border-radius: 12px;
+    padding: 15px;
+    width: max-content;
+    max-width: 400px;
+    text-align: left;
+}
+.tooltip-icon-legend:hover .tooltip-card {
+    opacity: 1;
+    visibility: visible;
+}
+.tooltip-card h3 {
+    font-size: 18px;
+    color: #fff;
+    margin-top: 0;
+    margin-bottom: 12px;
+}
+.tooltip-card .tooltip-description {
+    margin-bottom: 20px;
+    line-height: 1.3;
+}
+.tooltip-card .tooltip-items-container {
+    display: flex;
+    flex-direction: column;
+    gap: 10px;
+}
+.tooltip-card .tooltip-legend-item {
+    display: flex;
+    align-items:
+    flex-start;
+    gap: 10px;
+}
+.tooltip-card .tooltip-legend-item img {
+    width: 20px;
+    height: 20px;
+    margin-top: 2px;
+}
+.tooltip-card .tooltip-legend-item div {
+    display: flex;
+    flex-direction: column;
+}
+.tooltip-card .tooltip-legend-item strong {
+    font-weight: 600;
+    color: #fff;
+}
+.tooltip-card .tooltip-legend-item span {
+    font-size: 13px;
+    line-height: 1.3;
+}
+.tooltip-sub-list {
+    list-style-type: '• ';
+    padding-left: 18px;
+    font-size: 13px;
+    line-height: 1.3;
+    display: flex;
+    flex-direction: column;
+/* About Page CSS */
 #about-page-content-wrapper {
   margin-left: auto;
   margin-right: auto;

ui_components.py CHANGED Viewed

@@ -41,36 +41,36 @@ os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
 # Global variables
 COMBINED_ICON_MAP = {
     "Open Source + Open Weights": {
-        "Standard": "assets/os-ow-standard.svg",        # Bright pink star
-        "Custom with Standard Search": "assets/os-ow-equivalent.svg",    # Bright pink diamond
-        "Custom": "assets/os-ow-custom.svg",            # Bright pink triangle
     },
     "Open Source": {
-        "Standard": "assets/os-standard.svg",        # Orange/pink star
-        "Custom with Standard Search": "assets/os-equivalent.svg",    # Orange/pink diamond
-        "Fully Custom": "assets/os-custom.svg",            # Orange/pink triangle
     },
     "API Available": {
-        "Standard": "assets/api-standard.svg",       # Yellow/pink star
-        "Custom with Standard Search": "assets/api-equivalent.svg",   # Yellow/pink diamond
-        "Fully Custom": "assets/api-custom.svg",           # Yellow/pink triangle
     },
     "Closed": {
-        "Standard": "assets/c-standard.svg",        # Hollow pink star
-        "Equivalent": "assets/c-equivalent.svg",    # Hollow pink diamond
-        "Fully Custom": "assets/c-custom.svg",            # Hollow pink triangle
     }
 }
 OPENNESS_SVG_MAP = {
-    "Open Source + Open Weights": "assets/os-ow-standard.svg",
-    "Open Source": "assets/os-standard.svg",
-    "API Available": "assets/api-standard.svg",
-    "Closed": "assets/c-standard.svg",
 }
 TOOLING_SVG_MAP = {
-    "Standard": "assets/os-ow-standard.svg",
-    "Custom with Standard Search": "assets/os-ow-equivalent.svg",
-    "Fully Custom": "assets/os-ow-custom.svg",
 }
 def get_svg_as_data_uri(path: str) -> str:
@@ -122,6 +122,126 @@ def create_svg_html(value, svg_map):
         return f'<img src="{src}" style="width: 16px; height: 16px; vertical-align: middle;" alt="{value}" title="{value}">'
     return ""
 # Dynamically generate the correct HTML for the legend parts
 openness_html = " ".join([create_svg_html(name, OPENNESS_SVG_MAP) for name in OPENNESS_SVG_MAP])
 tooling_html = " ".join([create_svg_html(name, TOOLING_SVG_MAP) for name in TOOLING_SVG_MAP])
@@ -150,47 +270,67 @@ for name, path in TOOLING_SVG_MAP.items():
     )
 tooling_html = " ".join(tooling_html_items)
-# Your final legend_markdown string (the structure of this does not change)
-legend_markdown = f"""
-<div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 24px; font-size: 14px; padding-bottom: 8px;">
-    <div> <!-- Container for the Pareto section -->
-        <b>Pareto</b><span class="tooltip-icon" data-tooltip="Indicates if agent is on the Pareto frontier
-        ">ⓘ</span>
-        <div style="padding-top: 4px;"><span>🏆 On frontier</span></div>
-    </div>
-    <div> <!-- Container for the Openness section -->
-        <b>Agent Openness</b><span class="tooltip-icon" data-tooltip="•Closed: No API or code available
-        •API Available: API available, but no code
-        •Open Source: Code available, but no weights
-        •Open Source + Open Weights: Code and weights available
-        ">ⓘ</span>
-        <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 4px;">{openness_html}</div>
-    </div>
-    <div> <!-- Container for the Tooling section -->
-        <b>Agent Tooling</b><span class="tooltip-icon" data-tooltip="• Standard: Standard Approach used by the agent
-        • Custom with Standard Search: Standard search used by the agent
-        • Fully Custom: Fully custom tools used by the agent
-        ">ⓘ</span>
-        <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 4px;">{tooling_html}</div>
     </div>
-     <div><b>Column Descriptions</b><span class="tooltip-icon" data-tooltip="• Overall Score: Performance across all benchmarks
-        • Overall Cost: Cost per task in USD
-        • Literature Understanding Score: Performance on scientific literature tasks
-        • Literature Understanding Cost: Cost per literature understanding task in USD
-        • Data Analysis Score: Performance on data analysis tasks
-        • Code Execution Score: Performance on coding tasks
-        • Code Execution Cost: Cost per code execution task in USD
-        • Discovery Score: Performance on information discovery tasks
-        • Discovery Cost: Cost per discovery task in USD
-        • Categories Attempted: Number of benchmark categories the agent participated in
-        • Logs: Link to detailed evaluation logs">ⓘ</span></div>
-</div>
-"""
 # --- Global State for Viewers (simple caching) ---
 CACHED_VIEWERS = {}
@@ -330,7 +470,6 @@ def create_leaderboard_display(
     gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
     # Put table and key into an accordion
     with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
-        gr.HTML(value=legend_markdown, elem_id="legend-markdown")
         dataframe_component = gr.DataFrame(
             headers=df_headers,
             value=df_view,
@@ -340,6 +479,8 @@ def create_leaderboard_display(
             column_widths=[40, 40, 200, 200],
             elem_classes=["wrap-header-df"]
         )
     # Return the components so they can be referenced elsewhere.
     return plot_component, dataframe_component
@@ -487,7 +628,6 @@ def create_benchmark_details_display(
         gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
         # Put table and key into an accordion
         with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
-            gr.HTML(value=legend_markdown, elem_id="legend-markdown")
             gr.DataFrame(
                 headers=df_headers,
                 value=benchmark_table_df,
@@ -497,6 +637,8 @@ def create_benchmark_details_display(
                 column_widths=[40, 40, 200, 150, 175, 85],
                 elem_classes=["wrap-header-df"]
             )
 def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
     """

 # Global variables
 COMBINED_ICON_MAP = {
     "Open Source + Open Weights": {
+        "Standard": "assets/os-ow-standard.svg",
+        "Custom with Standard Search": "assets/os-ow-equivalent.svg",
+        "Custom": "assets/os-ow-custom.svg",
     },
     "Open Source": {
+        "Standard": "assets/os-standard.svg",
+        "Custom with Standard Search": "assets/os-equivalent.svg",
+        "Fully Custom": "assets/os-custom.svg",
     },
     "API Available": {
+        "Standard": "assets/api-standard.svg",
+        "Custom with Standard Search": "assets/api-equivalent.svg",
+        "Fully Custom": "assets/api-custom.svg",
     },
     "Closed": {
+        "Standard": "assets/c-standard.svg",
+        "Equivalent": "assets/c-equivalent.svg",
+        "Fully Custom": "assets/c-custom.svg",
     }
 }
 OPENNESS_SVG_MAP = {
+    "Open Source + Open Weights": "assets/os-ow-legend.svg",
+    "Open Source": "assets/os-legend.svg",
+    "API Available": "assets/api-legend.svg",
+    "Closed": "assets/c-legend.svg",
 }
 TOOLING_SVG_MAP = {
+    "Standard": "assets/standard-legend.svg",
+    "Custom with Standard Search": "assets/equivalent-legend.svg",
+    "Fully Custom": "assets/custom-legend.svg",
 }
 def get_svg_as_data_uri(path: str) -> str:
         return f'<img src="{src}" style="width: 16px; height: 16px; vertical-align: middle;" alt="{value}" title="{value}">'
     return ""
def build_openness_tooltip_content() -> str:
    """Build the legend rows shown inside the Agent Openness tooltip card.

    One ``tooltip-legend-item`` row is produced per entry of
    ``OPENNESS_SVG_MAP``, in the map's order. Each row holds the icon (its
    path is passed through ``get_svg_as_data_uri``), the openness name and a
    one-line description; names without a description get an empty one.

    Returns:
        All rows concatenated into a single HTML string.
    """
    blurbs = {
        "Open Source + Open Weights": "Both code and ML models are open",
        "Open Source": "Code is open but uses an ML model with closed-weights",
        "API Available": "No access to code; API access only",
        "Closed": "No access to code or API; UI  access only",
    }

    def render_row(label: str, svg_path: str) -> str:
        """Render the HTML for a single row of the tooltip legend."""
        icon_src = get_svg_as_data_uri(svg_path)
        blurb = blurbs.get(label, "")
        return f"""
            <div class="tooltip-legend-item">
                <img src="{icon_src}" alt="{label}">
                <div>
                    <strong>{label}</strong>
                    <span>{blurb}</span>
                </div>
            </div>
        """

    return "".join(
        render_row(label, svg_path) for label, svg_path in OPENNESS_SVG_MAP.items()
    )
def build_pareto_tooltip_content() -> str:
    """Return the inner HTML for the Pareto tooltip card.

    The markup is fixed text: a heading, a short explanation of the Pareto
    frontier, a two-item list of how an agent qualifies, and a note on the
    trophy icon that marks such agents.

    Returns:
        The HTML fragment placed inside the Pareto ``tooltip-card``.
    """
    # Plain string literal: nothing is interpolated, so no f-prefix is needed.
    return """
        <h3>On Pareto Frontier</h3>
        <p class="tooltip-description">The Pareto frontier represents the best balance between score and cost.</p>
        <p class="tooltip-description">Agents on the frontier either:</p>
        <ul class="tooltip-sub-list">
            <li>Offer the lowest cost for a given performance, or</li>
            <li>Deliver the best performance at a given cost.</li>
        </ul>
        <p class="tooltip-description" style="margin-top: 12px;">These agents are marked with this icon: 🏆</p>
    """
def build_tooling_tooltip_content() -> str:
    """Build the legend rows shown inside the Agent Tooling tooltip card.

    One ``tooltip-legend-item`` row is produced per entry of
    ``TOOLING_SVG_MAP``, in the map's order. Each row holds the icon (its
    path is passed through ``get_svg_as_data_uri``), the tooling name and a
    description. The "Custom with Standard Search" row additionally carries
    a nested bullet list spelling out its constraints.

    Returns:
        All rows concatenated into a single HTML string.
    """
    blurbs = {
        "Standard": "Uses only predefined tools from the evaluation environment (as defined in Inspect's state.tools).",
        "Custom with Standard Search": "Custom tools for accessing an equivalent underlying environment:",
        "Fully Custom": "Uses tools beyond constraints of Standard or Custom interface",
    }
    nested_bullets = """
        <ul class="tooltip-sub-list">
            <li>Literature tasks: Information access is limited to date-restricted usage of the Asta MCP tools.</li>
            <li>Code tasks: Code execution is limited to an iPython shell in a machine environment initialized with the standard Asta sandbox Dockerfile (or equivalent).</li>
        </ul>
    """

    def render_row(label: str, svg_path: str) -> str:
        """Render the HTML for a single row of the tooltip legend."""
        icon_src = get_svg_as_data_uri(svg_path)
        blurb = blurbs.get(label, "")
        # Only this one category has extra detail to list under its text.
        extra = ""
        if label == "Custom with Standard Search":
            extra = nested_bullets
        return f"""
            <div class="tooltip-legend-item">
                <img src="{icon_src}" alt="{label}">
                <div>
                    <strong>{label}</strong>
                    <span>{blurb}</span>
                    {extra}
                </div>
            </div>
        """

    return "".join(
        render_row(label, svg_path) for label, svg_path in TOOLING_SVG_MAP.items()
    )
def build_descriptions_tooltip_content(table: str) -> str:
    """Return the inner HTML of the Column Descriptions tooltip for one table.

    Args:
        table: Name of the table the legend belongs to. ``"Overall"`` selects
            the overall-leaderboard columns, one of the four category names
            selects the category columns, and any other value is treated as
            the name of an individual benchmark.

    Returns:
        An HTML fragment made of ``tooltip-description-item`` rows.
    """
    import html  # local import so this function does not rely on file-level imports

    category_tables = (
        "Literature Understanding",
        "Code & Execution",
        "Data Analysis",
        "End-to-End Discovery",
    )
    # The name is interpolated into markup below; escape it so characters such
    # as "&" or "<" in a table name are shown as text instead of parsed as HTML.
    label = html.escape(str(table))

    if table == "Overall":
        return """
            <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
            <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
            <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
            <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</div>
            <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Based on submission-time values. Each category contributes equally.</div>
            <div class="tooltip-description-item"><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</div>
            <div class="tooltip-description-item"><b>Literature Understanding Cost:</b> Macro-average cost per problem (USD) across Literature Understanding benchmarks.</div>
            <div class="tooltip-description-item"><b>Code Execution Score:</b> Macro-average score across Code & Execution benchmarks.</div>
            <div class="tooltip-description-item"><b>Code Execution Cost:</b> Macro-average cost per problem (USD) across Code & Execution benchmarks.</div>
            <div class="tooltip-description-item"><b>Data Analysis Score:</b> Macro-average score across Data Analysis benchmarks.</div>
            <div class="tooltip-description-item"><b>Data Analysis Cost:</b> Macro-average cost per problem (USD) across Data Analysis benchmarks.</div>
            <div class="tooltip-description-item"><b>End-to-End Discovery Score:</b> Macro-average score across End-to-End Discovery benchmarks.</div>
            <div class="tooltip-description-item"><b>End-to-End Discovery Cost:</b> Macro-average cost per problem (USD) across End-to-End Discovery benchmarks.</div>
            <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 4).</div>
            <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
        """
    elif table in category_tables:
        return f"""
            <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
            <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
            <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
            <div class="tooltip-description-item"><b>{label} Score:</b> Macro-average score across {label} benchmarks.</div>
            <div class="tooltip-description-item"><b>{label} Cost:</b> Macro-average cost per problem (USD) across {label} benchmarks.</div>
            <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div>
            <div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
            <div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
            <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
        """
    else:
        # Fallback for any other table type, e.g., individual benchmarks
        return f"""
            <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
            <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
            <div class="tooltip-description-item"><b>LLM Base:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
            <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
            <div class="tooltip-description-item"><b>{label} Score:</b> Score achieved by the agent on this benchmark.</div>
            <div class="tooltip-description-item"><b>{label} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
            <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
        """
 # Dynamically generate the correct HTML for the legend parts
 openness_html = " ".join([create_svg_html(name, OPENNESS_SVG_MAP) for name in OPENNESS_SVG_MAP])
 tooling_html = " ".join([create_svg_html(name, TOOLING_SVG_MAP) for name in TOOLING_SVG_MAP])
     )
 tooling_html = " ".join(tooling_html_items)
# Tooltip bodies that do not depend on the table being shown; they are built
# once here and read as module globals by create_legend_markdown().
pareto_tooltip_content = build_pareto_tooltip_content()
openness_tooltip_content = build_openness_tooltip_content()
tooling_tooltip_content = build_tooling_tooltip_content()


def create_legend_markdown(which_table: str) -> str:
    """Assemble the full legend HTML, tooltips included, for one table.

    The Pareto, Agent Openness and Agent Tooling sections reuse the
    module-level tooltip and icon strings. Only the Column Descriptions
    tooltip varies: it is built from ``which_table`` via
    ``build_descriptions_tooltip_content``.

    Args:
        which_table: Name of the table the legend is rendered for.

    Returns:
        The legend as a single HTML string.
    """
    column_help = build_descriptions_tooltip_content(which_table)
    return f"""
    <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 10px; font-size: 14px; padding-bottom: 8px;">
        <div> <!-- Container for the Pareto section -->
            <b>Pareto</b>
            <span class="tooltip-icon-legend">
                ⓘ
                <span class="tooltip-card">{pareto_tooltip_content}</span>
            </span>
            <div style="margin-top: 8px;"><span>🏆 On frontier</span></div>
        </div>
        <div> <!-- Container for the Openness section -->
            <b>Agent Openness</b>
            <span class="tooltip-icon-legend">
                ⓘ
                <span class="tooltip-card">
                    <h3>Agent Openness</h3>
                    <p class="tooltip-description">Indicates how transparent and reproducible an agent is.</p>
                    <div class="tooltip-items-container">{openness_tooltip_content}</div>
                </span>
            </span>
            <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 8px;">{openness_html}</div>
        </div>
        <div> <!-- Container for the Tooling section -->
            <b>Agent Tooling</b>
            <span class="tooltip-icon-legend">
                ⓘ
                <span class="tooltip-card">
                    <h3>Agent Tooling</h3>
                    <p class="tooltip-description">Describes the tool usage and execution environment of the agent during evaluation.</p>
                    <div class="tooltip-items-container">{tooling_tooltip_content}</div>
                </span>
            </span>
            <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 8px;">{tooling_html}</div>
        </div>
        <div><!-- Container for the Column Descriptions section -->
            <b>Column Descriptions</b>
            <span class="tooltip-icon-legend">
                ⓘ
                <span class="tooltip-card">
                    <h3>Column Descriptions</h3>
                    <div class="tooltip-items-container">{column_help}</div>
                </span>
            </span>
        </div>
    </div>
    """
 # --- Global State for Viewers (simple caching) ---
 CACHED_VIEWERS = {}
     gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
     # Put table and key into an accordion
     with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
         dataframe_component = gr.DataFrame(
             headers=df_headers,
             value=df_view,
             column_widths=[40, 40, 200, 200],
             elem_classes=["wrap-header-df"]
         )
+        legend_markdown = create_legend_markdown(category_name)
+        gr.HTML(value=legend_markdown, elem_id="legend-markdown")
     # Return the components so they can be referenced elsewhere.
     return plot_component, dataframe_component
         gr.HTML(value=scatter_disclaimer_html, elem_id="scatter-disclaimer")
         # Put table and key into an accordion
         with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
             gr.DataFrame(
                 headers=df_headers,
                 value=benchmark_table_df,
                 column_widths=[40, 40, 200, 150, 175, 85],
                 elem_classes=["wrap-header-df"]
             )
+            legend_markdown = create_legend_markdown(benchmark_name)
+            gr.HTML(value=legend_markdown, elem_id="legend-markdown")
 def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
     """