lingoly-too

Running

App Files Files Community

Jude Khouja committed on Mar 3

Commit

36ce9ab

0 Parent(s):

1st clean draft

Browse files

Files changed (13) hide show

.gitattributes +35 -0
.gitignore +175 -0
README.md +12 -0
app.py +41 -0
assets/OII_logo.png +0 -0
chat.py +1150 -0
data_loader.py +457 -0
leaderboard.csv +12 -0
requirements.txt +4 -0
results.csv +22 -0
tabs/leaderboard.py +160 -0
utils.py +107 -0
visualization.py +247 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,175 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# PyPI configuration file
+.pypirc
+data/
+.DS_Store
+get_results.ipynb

README.md ADDED Viewed

	@@ -0,0 +1,12 @@

+---
+title: LingOly-TOO benchmark
+emoji: 💬
+colorFrom: yellow
+colorTo: pink
+sdk: gradio
+sdk_version: 5.0.1
+app_file: app.py
+pinned: false
+license: apache-2.0
+short_description: Reasoning benchmark in linguistics
+---

app.py ADDED Viewed

	@@ -0,0 +1,41 @@

+# Silence all Python warnings before the remaining imports run
+import warnings
+warnings.filterwarnings("ignore")
+import gradio as gr
+from data_loader import (
+    METHODOLOGY,
+    load_data,
+    HEADER_CONTENT,
+    CARDS,
+)
+from tabs.leaderboard import create_leaderboard_tab, filter_leaderboard
def create_app():
    """Assemble and return the Gradio Blocks application.

    The leaderboard data is loaded once, the leaderboard tab is built from
    it, a page-load handler refreshes the leaderboard output using the
    "Score after obfuscation" filter, and the methodology HTML is rendered
    below the tab.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    leaderboard_df = load_data()
    soft_theme = gr.themes.Soft(font=[gr.themes.GoogleFont("sans-serif")])

    with gr.Blocks(theme=soft_theme) as blocks:
        # Leaderboard tab; its output component is refreshed on page load.
        leaderboard_output = create_leaderboard_tab(
            leaderboard_df, HEADER_CONTENT, CARDS
        )

        # Populate the leaderboard as soon as the page is opened.
        blocks.load(
            fn=lambda: filter_leaderboard(
                leaderboard_df, "Score after obfuscation"
            ),
            outputs=[leaderboard_output],
        )

        gr.HTML(METHODOLOGY)

    return blocks
# Keep ``demo`` at module level so tooling that imports this module can find
# the app object, but only start the server when run as a script.
demo = create_app()

if __name__ == "__main__":
    demo.launch()

assets/OII_logo.png ADDED Viewed

chat.py ADDED Viewed

	@@ -0,0 +1,1150 @@

+import json
def format_user_message(msg):
    """Format a user message as a right-aligned HTML chat bubble.

    Args:
        msg: Mapping with an optional ``"content"`` entry. The content may be
            ``None``, a string, a number, or a list of parts. In a list,
            ``None`` parts are skipped, dict parts with a ``"text"`` key
            contribute that text (unless it is ``None``), and every other
            part is converted with ``str``. Parts are joined with newlines.

    Returns:
        An HTML fragment. The message text is HTML-escaped so that markup
        characters in the conversation are displayed literally instead of
        being interpreted by the browser.
    """
    from html import escape  # local import keeps this block self-contained

    content = msg.get("content", "")
    if content is None:
        content = ""
    elif isinstance(content, list):
        # Multi-part content: collect the textual representation of each part.
        parts = []
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict) and "text" in item:
                text_value = item["text"]
                if text_value is not None:
                    parts.append(str(text_value))
            else:
                parts.append(str(item))
        content = "\n".join(parts).strip()

    # The text lands in element content (not an attribute), so quotes can
    # stay as they are; only &, < and > need escaping.
    content = escape(str(content), quote=False)

    # User message - align right using text-align instead of flex
    return f"""
    <div style="
        text-align: right;
        margin-bottom: 1.25rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-user);
            padding: 1rem;
            border-radius: 1rem 0 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">👤</span>User
            </div>
            <div style="white-space: pre-wrap; line-height: 1.5;">
                {content}
            </div>
        </div>
    </div>
    """
def format_tool_call(tool_name, tool_input):
    """Format a single tool call as an HTML card.

    Args:
        tool_name: Name of the tool. ``None`` is shown as "Unknown Tool";
            any other non-string value is converted with ``str``.
        tool_input: Arguments passed to the tool. ``None`` is treated as an
            empty dict. The value is rendered as indented JSON when
            possible.

    Returns:
        An HTML fragment with the tool name and its input. Both are
        HTML-escaped before being embedded.
    """
    from html import escape  # local import keeps this block self-contained

    tool_name = "Unknown Tool" if tool_name is None else str(tool_name)
    if tool_input is None:
        tool_input = {}

    try:
        tool_input_json = json.dumps(tool_input, indent=2)
    except (TypeError, ValueError):
        # Best-effort fallback. For dicts, stringify whatever JSON cannot
        # represent at ANY nesting depth (the previous top-level-only
        # simplification raised again on nested values). Anything else, or a
        # dict that still cannot be dumped (e.g. circular references or
        # unsupported key types), falls back to its plain ``str`` form.
        try:
            if isinstance(tool_input, dict):
                tool_input_json = json.dumps(tool_input, indent=2, default=str)
            else:
                tool_input_json = str(tool_input)
        except (TypeError, ValueError):
            tool_input_json = str(tool_input)

    # Escape for element content; quotes are left intact so the JSON stays
    # readable in the page source.
    tool_name = escape(tool_name, quote=False)
    tool_input_json = escape(tool_input_json, quote=False)

    return f"""
    <div style="
        background-color: var(--surface-color-alt);
        padding: 0.75rem;
        border-radius: 0.5rem;
        margin-top: 0.75rem;
        border-left: 3px solid var(--primary-text-light);">
        <div style="
            font-weight: 500;
            margin-bottom: 0.5rem;
            font-size: 0.9rem;
            color: var(--primary-text);">
            <span style="margin-right: 0.5rem;">🔧</span>{tool_name}
        </div>
        <div style="
            font-family: monospace;
            font-size: 0.85rem;
            white-space: pre-wrap;">
            {tool_input_json}
        </div>
    </div>
    """
def extract_assistant_content(msg):
    """Extract text content and tool-call HTML from an assistant message.

    Args:
        msg: Mapping that may contain ``"content"`` (``None``, string,
            number, or list of parts) and/or ``"tool_calls"`` (list of dicts
            with ``"name"`` and ``"args"``). In a content list, dict parts
            with a ``"text"`` key contribute text, dict parts whose
            ``"type"`` is ``"tool_use"`` are rendered as tool calls, other
            dict parts are ignored, and non-dict parts are converted with
            ``str``.

    Returns:
        A ``(assistant_text, tool_calls_html)`` tuple. ``assistant_text`` is
        the stripped plain text (NOT HTML-escaped; the caller embeds it).
        ``tool_calls_html`` is the concatenated output of
        ``format_tool_call``.
    """
    text_parts = []
    tool_calls_html = ""

    has_content = "content" in msg
    content = msg["content"] if has_content else None

    if isinstance(content, str):
        text_parts.append(content)
    elif isinstance(content, (int, float)):
        text_parts.append(str(content))
    elif isinstance(content, list):
        # List content can interleave text parts and tool_use parts.
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict):
                if "text" in item:
                    if item["text"] is not None:
                        text_parts.append(str(item["text"]))
                elif item.get("type") == "tool_use":
                    tool_input = item.get("input", {})
                    if tool_input is None:
                        tool_input = {}
                    tool_calls_html += format_tool_call(
                        item.get("name", "Unknown Tool"), tool_input
                    )
            else:
                text_parts.append(str(item))

    # A message can carry ``tool_calls`` next to a (possibly null) ``content``
    # key, so process them independently instead of only when ``content`` is
    # absent.
    has_tool_calls_key = "tool_calls" in msg
    rendered_tool_call = False
    if has_tool_calls_key:
        for tool_call in msg["tool_calls"] or []:
            # Skip malformed entries rather than failing the whole message.
            if not isinstance(tool_call, dict):
                continue
            tool_args = tool_call.get("args", {})
            if tool_args is None:
                tool_args = {}
            tool_calls_html += format_tool_call(
                tool_call.get("name", "Unknown Tool"), tool_args
            )
            rendered_tool_call = True

    assistant_text = "\n".join(text_parts).strip()

    # Placeholder text when only tools were used. Without a ``content`` key it
    # is always shown (the historical behaviour); with an empty ``content`` it
    # is shown only when a tool call was actually rendered.
    if (
        has_tool_calls_key
        and not assistant_text
        and (not has_content or rendered_tool_call)
    ):
        assistant_text = "The assistant used the following tools:"

    return assistant_text, tool_calls_html
def format_assistant_message(msg):
    """Format an assistant message as a left-aligned HTML chat bubble.

    Args:
        msg: Assistant message mapping; it is passed to
            ``extract_assistant_content`` to obtain the plain text and the
            pre-rendered tool-call HTML.

    Returns:
        An HTML fragment. The assistant text is HTML-escaped here so markup
        characters in the reply are displayed literally; the tool-call HTML
        is embedded as-is because it is already markup.
    """
    from html import escape  # local import keeps this block self-contained

    assistant_text, tool_calls_html = extract_assistant_content(msg)
    # Text goes into element content, so quotes do not need escaping.
    assistant_text = escape(assistant_text, quote=False)

    return f"""
    <div style="
        text-align: left;
        margin-bottom: 1.25rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-assistant);
            padding: 1rem;
            border-radius: 0 1rem 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">🤖</span>Assistant
            </div>
            <div style="white-space: pre-wrap; line-height: 1.5;">
                {assistant_text}
            </div>
            {tool_calls_html}
        </div>
    </div>
    """
def format_system_message(msg):
    """Format a system (or any non-user, non-assistant) message for display.

    Args:
        msg: Mapping with an optional ``"content"`` entry. The content may be
            ``None``, a string, a number, or a list of parts. In a list,
            ``None`` parts are skipped, dict parts with a ``"text"`` key
            contribute that text (unless it is ``None``), and every other
            part is converted with ``str``. Parts are joined with newlines.

    Returns:
        A centered, italic HTML fragment. The message text is HTML-escaped
        so markup characters are displayed literally.
    """
    from html import escape  # local import keeps this block self-contained

    content = msg.get("content", "")
    if content is None:
        content = ""
    elif isinstance(content, list):
        parts = []
        for item in content:
            if item is None:
                continue
            if isinstance(item, dict) and "text" in item:
                text_value = item["text"]
                if text_value is not None:
                    parts.append(str(text_value))
            else:
                parts.append(str(item))
        content = "\n".join(parts).strip()

    # Text goes into element content, so quotes do not need escaping.
    content = escape(str(content), quote=False)

    return f"""
    <div style="
        text-align: center;
        margin-bottom: 1rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--message-bg-system);
            padding: 0.75rem;
            border-radius: 0.5rem;
            color: var(--text-color);
            text-align: left;
            font-style: italic;
            font-size: 0.9rem;">
            {content}
        </div>
    </div>
    """
def parse_complex_response(response):
    """Parse a (possibly JSON-encoded) response into text and tool-call HTML.

    Args:
        response: The raw response. ``None`` yields empty text, numbers are
            converted with ``str``, and other non-string values are
            stringified before parsing.

    Returns:
        A ``(text_content, tool_calls_html)`` tuple.

        * Text that does not look like JSON, is not valid JSON, or decodes
          to something without a ``"content"``/``"tool_calls"`` entry is
          returned unchanged as ``text_content`` so the reader still sees
          the reply.
        * For a JSON object (or the first element of a JSON array) with
          ``"content"``, the text is the content string, or the joined
          ``"text"`` of list parts whose ``"type"`` is ``"text"``.
        * ``tool_calls_html`` is an HTML card with the JSON of
          ``"tool_calls"`` when that entry is non-empty, else ``""``.

        ``text_content`` is plain text (NOT HTML-escaped; the caller embeds
        it). The tool-call JSON inside ``tool_calls_html`` is escaped.
    """
    from html import escape  # local import keeps this block self-contained

    if response is None:
        return "", ""
    if isinstance(response, (int, float)):
        return str(response), ""
    if not isinstance(response, str):
        response = str(response)

    # Only attempt JSON decoding when the text looks like an object or array.
    if not response.strip().startswith(("[", "{")):
        return response, ""

    try:
        response_obj = json.loads(response)
    except ValueError:
        # json.JSONDecodeError is a ValueError: ordinary prose that merely
        # starts with a bracket is shown as-is, without an error suffix.
        return response, ""

    # Array format: the first element carries the message.
    if isinstance(response_obj, list) and response_obj:
        response_obj = response_obj[0]

    # Nothing recognisable to extract: keep the raw text visible rather
    # than rendering an empty reply.
    if not isinstance(response_obj, dict) or not (
        "content" in response_obj or "tool_calls" in response_obj
    ):
        return response, ""

    text_content = ""
    content = response_obj.get("content")
    if isinstance(content, str):
        text_content = content
    elif isinstance(content, (int, float)):
        text_content = str(content)
    elif isinstance(content, list):
        # Extract only text content from items with type="text".
        text_content = "\n".join(
            str(item["text"])
            for item in content
            if isinstance(item, dict)
            and item.get("type") == "text"
            and item.get("text") is not None
        )

    tool_calls_html = ""
    tool_calls = response_obj.get("tool_calls")
    if tool_calls:
        try:
            tool_calls_json = json.dumps(tool_calls, indent=2)
        except (TypeError, ValueError):
            # Fallback if JSON serialization fails
            tool_calls_html = (
                "<div>Tool calls present but could not be formatted.</div>"
            )
        else:
            tool_calls_json = escape(tool_calls_json, quote=False)
            tool_calls_html = f"""
                    <div style="
                        background-color: var(--surface-color-alt);
                        padding: 0.75rem;
                        border-radius: 0.5rem;
                        margin-top: 0.75rem;
                        border-left: 3px solid var(--primary-text-light);">
                        <div style="
                            font-weight: 500;
                            margin-bottom: 0.5rem;
                            font-size: 0.9rem;
                            color: var(--primary-text);">
                            <span style="margin-right: 0.5rem;">🔧</span>Tool Calls
                        </div>
                        <div style="
                            font-family: monospace;
                            font-size: 0.85rem;
                            white-space: pre-wrap;">
                            {tool_calls_json}
                        </div>
                    </div>
                    """

    return text_content.strip(), tool_calls_html
def format_final_response(response):
    """Format the final response as a highlighted HTML chat bubble.

    Args:
        response: The raw final response; it is passed to
            ``parse_complex_response`` to obtain the display text and any
            pre-rendered tool-call HTML.

    Returns:
        An HTML fragment. The text is HTML-escaped here so markup characters
        in the reply are displayed literally; the tool-call HTML is embedded
        as-is because it is already markup.
    """
    from html import escape  # local import keeps this block self-contained

    text_content, tool_calls_html = parse_complex_response(response)

    # NOTE: the former second-stage "basic JSON parsing" fallback was dropped.
    # It only ran when the parser handed back the response unchanged, and in
    # every such case it ended up re-assigning the same text.

    # Text goes into element content, so quotes do not need escaping.
    text_content = escape(str(text_content), quote=False)

    return f"""
    <div style="
        text-align: left;
        margin-bottom: 1.25rem;
        margin-top: 1.5rem;
        padding: 0 0.5rem;">
        <div style="
            display: inline-block;
            max-width: 85%;
            background-color: var(--response-bg);
            padding: 1rem;
            border-radius: 0 1rem 1rem 1rem;
            color: var(--text-color);
            text-align: left;
            box-shadow: 0 1px 2px var(--shadow-color);
            border-left: 4px solid var(--primary-text);">
            <div style="
                font-weight: 500;
                margin-bottom: 0.5rem;
                color: var(--primary-text);
                display: flex;
                align-items: center;">
                <span style="margin-right: 0.5rem;">🤖</span>Final Response
            </div>
            <div style="
                white-space: pre-wrap;
                line-height: 1.5;
                font-family: var(--font-sans);">
                {text_content}
            </div>
            {tool_calls_html}
        </div>
    </div>
    """
def update_chat_display(existing_display, new_message):
    """Update an existing chat display with a new message.

    Args:
        existing_display: HTML string of the chat rendered so far.
        new_message: Mapping with a ``"role"`` and message payload. Roles
            ``"user"`` and ``"assistant"``/``"ai"`` (case-insensitive) get
            their dedicated bubbles; any other, missing, or null role is
            rendered as a system message.

    Returns:
        The updated HTML. The new message is inserted before the marker div
        when that marker occurs exactly once in ``existing_display``;
        otherwise it is appended. If formatting fails, the existing display
        is returned with an error box appended instead of raising.
    """
    from html import escape  # local import keeps this block self-contained

    insert_marker = '<div style="padding-top: 0.5rem;margin-top: 1rem;margin-bottom: 1rem;border-top: 1px solid var(--border-color-light);'

    try:
        # ``or`` also covers an explicit ``"role": None``, which previously
        # crashed on ``.lower()`` and produced an error box.
        role = str(new_message.get("role") or "unknown").lower()

        if role == "user":
            message_html = format_user_message(new_message)
        elif role in ("assistant", "ai"):
            message_html = format_assistant_message(new_message)
        else:
            message_html = format_system_message(new_message)

        # Insert before the marker only when it is present exactly once.
        head, found, tail = existing_display.partition(insert_marker)
        if found and insert_marker not in tail:
            return head + message_html + insert_marker + tail

        # If we can't find the insertion point, append to the end
        return existing_display + message_html
    except Exception as e:
        # UI boundary: show the failure inline rather than breaking the page.
        error_text = escape(str(e), quote=False)
        return (
            existing_display
            + f"""
        <div style="
            padding: 1rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;
            margin-top: 1rem;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Updating Chat</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{error_text}</div>
        </div>
        """
        )
def format_chat_display(row):
    """Format a full conversation plus its final response as an HTML panel.

    Args:
        row: Mapping-like object with a ``"conversation"`` entry (JSON text
            of a list of message dicts) and a ``"response"`` entry (the
            final response).

    Returns:
        The HTML for the whole chat panel. If anything fails while parsing
        or formatting, an error box describing the failure is returned
        instead of raising; it includes the original conversation when that
        value can be read from ``row``.
    """
    from html import escape  # local import keeps this block self-contained

    try:
        # Parse the conversation JSON
        messages = json.loads(row["conversation"])

        rendered = []
        for msg in messages:
            # ``or`` also covers an explicit ``"role": None``.
            role = str(msg.get("role") or "unknown").lower()
            if role == "user":
                rendered.append(format_user_message(msg))
            elif role in ("assistant", "ai"):
                rendered.append(format_assistant_message(msg))
            else:
                # System or other message types
                rendered.append(format_system_message(msg))
        messages_html = "".join(rendered)

        # Format the final response from the assistant
        response_html = format_final_response(row["response"])

        return f"""
        <div style="
            padding: 1.5rem;
            background-color: var(--surface-color);
            border-radius: 10px;
            border: 1px solid var(--border-color);
            box-shadow: 0 2px 6px var(--shadow-color);
            height: 100%;
            overflow-y: auto;
            max-height: 600px;
            font-family: var(--font-sans);">
            <div style="
                padding-bottom: 1rem;
                margin-bottom: 1.5rem;
                border-bottom: 1px solid var(--border-color-light);
                display: flex;
                align-items: center;">
                <div style="
                    font-weight: 600;
                    font-size: 1.1rem;
                    color: var(--primary-text);">
                    <span style="margin-right: 0.5rem;">💬</span>Conversation
                </div>
            </div>
            {messages_html}
            {response_html}
        </div>
        """
    except Exception as e:
        # UI boundary: render the failure instead of propagating it.
        # Reading the conversation again can itself fail (for example when
        # the key was missing in the first place), so guard that lookup to
        # keep the error path from raising.
        try:
            original_conversation = str(row["conversation"])
        except (KeyError, IndexError, TypeError):
            original_conversation = "unavailable"

        error_text = escape(str(e), quote=False)
        original_conversation = escape(original_conversation, quote=False)
        return f"""
        <div style="
            padding: 1.5rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 10px;">
            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Chat</div>
            <div style="font-family: monospace; white-space: pre-wrap;">{error_text}</div>
            <div style="margin-top: 1rem; font-family: monospace; font-size: 0.8rem;">
                Original conversation: {original_conversation}
            </div>
        </div>
        """
def _as_dict(value):
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def parse_tool_schema(tool):
    """Parse a tool schema into its name, description, and parameters.

    Two layouts are supported:

    * Function layout: ``{"function": {"name", "description",
      "parameters": {"properties", "required"}}}``. Each parameter is
      described as ``"<description> (Type: <type>, Default: <default>)"``.
    * Flat layout: ``{"title", "description", "properties", "required"}``.
      Each parameter is described as
      ``"<description> (Type: <type>, Title: <title>)"``.

    Args:
        tool: The schema dict, or a list whose first element is the schema.
            Missing, null, or malformed sections are treated as empty
            instead of raising.

    Returns:
        A ``(name, description, parameters)`` tuple where ``parameters``
        maps each parameter name to its description string. Descriptions of
        parameters listed under ``"required"`` are prefixed with
        ``"[REQUIRED] "``.
    """
    # Handle schema wrapped in a list (an empty list means "no schema").
    if isinstance(tool, list):
        tool = tool[0] if tool else {}
    tool = _as_dict(tool)

    if "function" in tool:
        # Function layout: details live under the "function" key.
        info = _as_dict(tool["function"])
        schema = _as_dict(info.get("parameters"))
        name = info.get("name", "Unnamed Tool")
        uses_function_layout = True
    else:
        # Flat layout: the tool dict is itself the schema.
        info = tool
        schema = tool
        name = tool.get("title", "Unnamed Tool")
        uses_function_layout = False

    description = info.get("description", "No description available")

    parameters = {}
    for param_name, param_data in _as_dict(schema.get("properties")).items():
        param_data = _as_dict(param_data)
        param_desc = param_data.get("description", "No description")
        param_type = param_data.get("type", "unknown")
        if uses_function_layout:
            param_default = param_data.get("default", "None")
            extra = f"Default: {param_default}"
        else:
            param_title = param_data.get("title", param_name)
            extra = f"Title: {param_title}"
        parameters[param_name] = f"{param_desc} (Type: {param_type}, {extra})"

    # Mark required parameters; names without a matching property are ignored.
    for param_name in schema.get("required") or []:
        if param_name in parameters:
            parameters[param_name] = f"[REQUIRED] {parameters[param_name]}"

    return name, description, parameters
def format_parameters(parameters):
    """Render a mapping of tool parameters as a list of HTML cards.

    Args:
        parameters: Mapping of parameter name to description string. A
            description containing the ``[REQUIRED]`` marker is shown with a
            "Required" badge (and the ``"[REQUIRED] "`` marker text
            removed); all others get an "Optional" badge.

    Returns:
        An HTML fragment with one card per parameter, or a "No parameters"
        placeholder when the mapping is empty or ``None``. Cards are
        separated by a bottom border; the last card has no border or
        trailing spacing. Names and descriptions are HTML-escaped.
    """
    from html import escape  # local import keeps this block self-contained

    if not parameters:
        return '<div style="color: var(--text-muted); font-style: italic;">No parameters</div>'

    # Spacing and divider applied to every card except the last. This is
    # done inline per card: a ``<style>`` rule such as ``div:last-child``
    # is global and would restyle unrelated elements on the page.
    separator_style = (
        "margin-bottom: 1.2rem; "
        "padding-bottom: 1.2rem; "
        "border-bottom: 1px solid var(--border-color);"
    )
    last_index = len(parameters) - 1

    cards = []
    for index, (name, desc) in enumerate(parameters.items()):
        desc = str(desc)
        is_required = "[REQUIRED]" in desc
        param_style = "required" if is_required else "optional"
        badge_background = (
            "rgba(234, 67, 53, 0.1)" if is_required else "rgba(160, 160, 160, 0.1)"
        )
        badge_label = "Required" if is_required else "Optional"
        # Clean up the description to remove the REQUIRED marker but keep the info
        cleaned_desc = desc.replace("[REQUIRED] ", "") if is_required else desc

        card_style = "" if index == last_index else separator_style
        safe_name = escape(str(name), quote=False)
        safe_desc = escape(cleaned_desc, quote=False)

        cards.append(f"""
        <div style="{card_style}">
            <div style="
                display: flex;
                align-items: center;
                justify-content: space-between;
                margin-bottom: 0.5rem;">
                <div style="
                    font-weight: 600;
                    color: var(--primary-text);
                    font-size: 1.05rem;
                    display: flex;
                    align-items: center;">
                    {safe_name}
                </div>
                <div style="
                    font-size: 0.8rem;
                    padding: 0.2rem 0.6rem;
                    border-radius: 12px;
                    background-color: {badge_background};
                    color: var(--{param_style}-color);
                    font-weight: 500;">
                    {badge_label}
                </div>
            </div>
            <div style="
                color: var(--text-color);
                line-height: 1.5;
                font-size: 0.95rem;
                opacity: 0.9;">
                {safe_desc}
            </div>
        </div>
        """)

    return "".join(cards)
+def format_metrics(score, rationale, explanation):
+    """Format metrics display with improved visual hierarchy and dark theme support."""
+    # Determine score color and add emoji indicator
+    if score >= 0.7:
+        score_color = "var(--score-high)"
+        score_emoji = "🟢"
+        score_text = "High"
+    elif score >= 0.4:
+        score_color = "var(--score-med)"
+        score_emoji = "🟠"
+        score_text = "Medium"
+    else:
+        score_color = "var(--score-low)"
+        score_emoji = "🔴"
+        score_text = "Low"
+    return f"""
+    <div style="
+        padding: 1.75rem;
+        background-color: var(--surface-color);
+        border-radius: 10px;
+        border: 1px solid var(--border-color);
+        box-shadow: 0 3px 8px var(--shadow-color);">
+        <div style="
+            display: flex;
+            align-items: center;
+            margin-bottom: 1.75rem;
+            padding-bottom: 1.5rem;
+            border-bottom: 1px solid var(--border-color-light);">
+            <div style="flex: 1;">
+                <h3 style="
+                    color: var(--text-color);
+                    font-size: 1.2rem;
+                    margin-bottom: 0.25rem;
+                    font-weight: 600;">TSQ Score</h3>
+                <div style="
+                    display: flex;
+                    align-items: baseline;">
+                    <div style="
+                        font-size: 2.5rem;
+                        font-weight: 700;
+                        color: {score_color};">
+                        {score:.2f}
+                    </div>
+                    <div style="
+                        margin-left: 0.75rem;
+                        font-size: 1rem;
+                        color: {score_color};
+                        font-weight: 500;
+                        display: flex;
+                        align-items: center;">
+                        <span style="margin-right: 0.5rem;">{score_emoji}</span>{score_text}
+                    </div>
+                </div>
+            </div>
+        </div>
+        <div style="margin-bottom: 1.75rem;">
+            <h3 style="
+                color: var(--text-color);
+                font-size: 1.1rem;
+                margin-bottom: 0.75rem;
+                font-weight: 600;
+                display: flex;
+                align-items: center;">
+                <span style="
+                    display: inline-block;
+                    width: 18px;
+                    height: 18px;
+                    background-color: var(--primary-text-light);
+                    border-radius: 4px;
+                    margin-right: 0.5rem;"></span>
+                Rationale
+            </h3>
+            <div style="
+                color: var(--text-color);
+                line-height: 1.6;
+                padding-left: 1.5rem;
+                border-left: 3px solid var(--primary-text-light);
+                font-size: 0.95rem;">
+                {rationale}
+            </div>
+        </div>
+        <div>
+            <h3 style="
+                color: var(--text-color);
+                font-size: 1.1rem;
+                margin-bottom: 0.75rem;
+                font-weight: 600;
+                display: flex;
+                align-items: center;">
+                <span style="
+                    display: inline-block;
+                    width: 18px;
+                    height: 18px;
+                    background-color: var(--primary-text-light);
+                    border-radius: 4px;
+                    margin-right: 0.5rem;"></span>
+                Explanation
+            </h3>
+            <div style="
+                color: var(--text-color);
+                line-height: 1.6;
+                padding-left: 1.5rem;
+                border-left: 3px solid var(--primary-text-light);
+                font-size: 0.95rem;">
+                {explanation}
+            </div>
+        </div>
+    </div>
+    """
+def format_metrics_display(row):
+    """Format the metrics display with score, rationale and explanation."""
+    try:
+        score = row["score"]
+        rationale = row["rationale"]
+        explanation = row["explanation"]
+        # Determine score color and add emoji indicator
+        if score >= 0.7:
+            score_color = "var(--score-high)"
+            score_emoji = "🟢"
+            score_text = "High"
+        elif score >= 0.4:
+            score_color = "var(--score-med)"
+            score_emoji = "🟠"
+            score_text = "Medium"
+        else:
+            score_color = "var(--score-low)"
+            score_emoji = "🔴"
+            score_text = "Low"
+        metrics_html = f"""
+        <div style="
+            padding: 1.5rem;
+            background-color: var(--surface-color);
+            border-radius: 10px;
+            border: 1px solid var(--border-color);
+            box-shadow: 0 2px 6px var(--shadow-color);
+            height: 100%;
+            overflow-y: auto;
+            max-height: 600px;">
+            <div style="
+                padding-bottom: 1rem;
+                margin-bottom: 1.5rem;
+                border-bottom: 1px solid var(--border-color-light);
+                display: flex;
+                align-items: center;">
+                <div style="
+                    font-weight: 600;
+                    font-size: 1.1rem;
+                    color: var(--primary-text);">
+                    <span style="margin-right: 0.5rem;">📊</span>Evaluation Metrics
+                </div>
+            </div>
+            <div style="
+                margin-bottom: 1.5rem;
+                padding-bottom: 1.5rem;
+                border-bottom: 1px solid var(--border-color-light);">
+                <div style="
+                    display: flex;
+                    align-items: center;
+                    justify-content: space-between;">
+                    <div>
+                        <div style="
+                            font-weight: 600;
+                            margin-bottom: 0.25rem;
+                            color: var(--text-color);">
+                            TSQ Score
+                        </div>
+                        <div style="
+                            font-size: 2.5rem;
+                            font-weight: 700;
+                            color: {score_color};
+                            display: flex;
+                            align-items: center;">
+                            {score:.2f}
+                            <div style="
+                                margin-left: 0.75rem;
+                                font-size: 1rem;
+                                display: flex;
+                                align-items: center;">
+                                {score_emoji} <span style="margin-left: 0.25rem;">{score_text}</span>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+            <div style="margin-bottom: 1.5rem;">
+                <div style="
+                    font-weight: 600;
+                    margin-bottom: 0.75rem;
+                    color: var(--text-color);
+                    display: flex;
+                    align-items: center;">
+                    <span style="
+                        display: inline-block;
+                        width: 12px;
+                        height: 12px;
+                        background-color: var(--primary-text-light);
+                        border-radius: 2px;
+                        margin-right: 0.5rem;"></span>
+                    Rationale
+                </div>
+                <div style="
+                    background-color: var(--surface-color-alt);
+                    padding: 1rem;
+                    border-radius: 8px;
+                    border-left: 3px solid var(--primary-text-light);
+                    line-height: 1.5;
+                    color: var(--text-color);
+                    font-size: 0.95rem;">
+                    {rationale}
+                </div>
+            </div>
+            <div>
+                <div style="
+                    font-weight: 600;
+                    margin-bottom: 0.75rem;
+                    color: var(--text-color);
+                    display: flex;
+                    align-items: center;">
+                    <span style="
+                        display: inline-block;
+                        width: 12px;
+                        height: 12px;
+                        background-color: var(--primary-text-light);
+                        border-radius: 2px;
+                        margin-right: 0.5rem;"></span>
+                    Explanation
+                </div>
+                <div style="
+                    background-color: var(--surface-color-alt);
+                    padding: 1rem;
+                    border-radius: 8px;
+                    border-left: 3px solid var(--primary-text-light);
+                    line-height: 1.5;
+                    color: var(--text-color);
+                    font-size: 0.95rem;">
+                    {explanation}
+                </div>
+            </div>
+        </div>
+        """
+        return metrics_html
+    except Exception as e:
+        return f"""
+        <div style="
+            padding: 1.5rem;
+            color: var(--score-low);
+            background-color: var(--surface-color);
+            border: 1px solid var(--score-low);
+            border-radius: 10px;">
+            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Metrics</div>
+            <div style="font-family: monospace; white-space: pre-wrap;">{str(e)}</div>
+        </div>
+        """
+def format_tool_info(tools_data):
+    """Format the tool information with improved styling."""
+    try:
+        if not tools_data or tools_data == "[]":
+            return """
+            <div style="
+                padding: 1.5rem;
+                text-align: center;
+                color: var(--text-muted);
+                background-color: var(--surface-color);
+                border-radius: 10px;
+                border: 1px solid var(--border-color);
+                box-shadow: 0 2px 6px var(--shadow-color);">
+                <div style="font-size: 1.5rem; margin-bottom: 0.75rem;">🔍</div>
+                <div style="font-weight: 500; margin-bottom: 0.5rem;">No Tool Information</div>
+                <div style="font-size: 0.9rem; font-style: italic;">This conversation doesn't use any tools</div>
+            </div>
+            """
+        if isinstance(tools_data, str):
+            try:
+                tools = json.loads(tools_data)
+            except:
+                tools = []
+        else:
+            tools = tools_data
+        if not tools:
+            return """
+            <div style="
+                padding: 1.5rem;
+                text-align: center;
+                color: var(--text-muted);
+                background-color: var(--surface-color);
+                border-radius: 10px;
+                border: 1px solid var(--border-color);
+                box-shadow: 0 2px 6px var(--shadow-color);">
+                <div style="font-size: 1.5rem; margin-bottom: 0.75rem;">🔍</div>
+                <div style="font-weight: 500; margin-bottom: 0.5rem;">No Tool Information</div>
+                <div style="font-size: 0.9rem; font-style: italic;">This conversation doesn't use any tools</div>
+            </div>
+            """
+        # Format each tool
+        tool_items = ""
+        for tool in tools:
+            name = tool.get("title", tool.get("name", "Unnamed Tool"))
+            description = tool.get("description", "No description available")
+            # Get parameters
+            parameters = {}
+            required_params = []
+            # Handle different schema formats
+            if "function" in tool:
+                # Function schema format
+                function_data = tool["function"]
+                name = function_data.get("name", name)
+                description = function_data.get("description", description)
+                if (
+                    "parameters" in function_data
+                    and "properties" in function_data["parameters"]
+                ):
+                    properties = function_data["parameters"]["properties"]
+                    for param_name, param_data in properties.items():
+                        param_desc = param_data.get("description", "No description")
+                        param_type = param_data.get("type", "unknown")
+                        param_default = param_data.get("default", "None")
+                        parameters[param_name] = {
+                            "description": param_desc,
+                            "type": param_type,
+                            "default": param_default,
+                        }
+                    required_params = function_data.get("parameters", {}).get(
+                        "required", []
+                    )
+            elif "properties" in tool:
+                # Original schema format
+                if "properties" in tool:
+                    for param_name, param_data in tool["properties"].items():
+                        param_desc = param_data.get("description", "No description")
+                        param_type = param_data.get("type", "unknown")
+                        param_title = param_data.get("title", param_name)
+                        parameters[param_name] = {
+                            "description": param_desc,
+                            "type": param_type,
+                            "title": param_title,
+                        }
+                    required_params = tool.get("required", [])
+            # Format parameters
+            params_html = ""
+            if parameters:
+                for param_name, param_data in parameters.items():
+                    is_required = param_name in required_params
+                    param_style = "required" if is_required else "optional"
+                    params_html += f"""
+                    <div style="
+                        margin-bottom: 1rem;
+                        padding-bottom: 1rem;
+                        border-bottom: 1px solid var(--border-color-light);">
+                        <div style="
+                            display: flex;
+                            align-items: center;
+                            justify-content: space-between;
+                            margin-bottom: 0.5rem;">
+                            <div style="
+                                font-weight: 600;
+                                color: var(--primary-text);
+                                font-size: 0.95rem;">
+                                {param_name}
+                            </div>
+                            <div style="
+                                font-size: 0.75rem;
+                                padding: 0.15rem 0.5rem;
+                                border-radius: 12px;
+                                background-color: {f"rgba(234, 67, 53, 0.1)" if is_required else "rgba(160, 160, 160, 0.1)"};
+                                color: {f"var(--score-low)" if is_required else "var(--text-muted)"};
+                                font-weight: 500;">
+                                {f"Required" if is_required else "Optional"}
+                            </div>
+                        </div>
+                        <div style="
+                            color: var(--text-muted);
+                            line-height: 1.5;
+                            font-size: 0.85rem;
+                            margin-bottom: 0.25rem;">
+                            {param_data.get("description", "No description")}
+                        </div>
+                        <div style="
+                            display: flex;
+                            font-size: 0.8rem;
+                            color: var(--text-muted);">
+                            <div style="margin-right: 1rem;">
+                                <span style="font-weight: 500;">Type:</span> {param_data.get("type", "unknown")}
+                            </div>
+                            {f'<div><span style="font-weight: 500;">Default:</span> {param_data.get("default", "None")}</div>' if "default" in param_data else ''}
+                        </div>
+                    </div>
+                    """
+            else:
+                params_html = """
+                <div style="
+                    color: var(--text-muted);
+                    font-style: italic;
+                    padding: 0.75rem;
+                    text-align: center;
+                    font-size: 0.9rem;">
+                    No parameters
+                </div>
+                """
+            # Remove border from last parameter
+            params_html += """
+            <style>
+                .tool-params > div:last-child {
+                    border-bottom: none !important;
+                    margin-bottom: 0 !important;
+                    padding-bottom: 0 !important;
+                }
+            </style>
+            """
+            tool_items += f"""
+            <div style="
+                margin-bottom: 1.5rem;
+                padding: 1.5rem;
+                border-radius: 8px;
+                background-color: var(--surface-color-alt);
+                border: 1px solid var(--border-color);
+                box-shadow: 0 1px 3px var(--shadow-color);">
+                <div style="
+                    font-weight: 600;
+                    color: var(--primary-text);
+                    margin-bottom: 0.75rem;
+                    font-size: 1.05rem;
+                    display: flex;
+                    align-items: center;">
+                    <span style="margin-right: 8px;">⚙️</span> {name}
+                </div>
+                <div style="
+                    color: var(--text-color);
+                    margin-bottom: 1.25rem;
+                    line-height: 1.5;
+                    font-size: 0.95rem;
+                    padding-left: 0.5rem;
+                    border-left: 3px solid var(--primary-text-light);">
+                    {description}
+                </div>
+                <div style="
+                    font-weight: 600;
+                    color: var(--text-color);
+                    margin-bottom: 0.75rem;
+                    font-size: 0.9rem;">
+                    Parameters:
+                </div>
+                <div class="tool-params">
+                    {params_html}
+                </div>
+            </div>
+            """
+        full_tools_html = f"""
+        <div style="
+            padding: 1.5rem;
+            background-color: var(--surface-color);
+            border-radius: 10px;
+            border: 1px solid var(--border-color);
+            box-shadow: 0 2px 6px var(--shadow-color);
+            height: 100%;
+            overflow-y: auto;
+            max-height: 600px;">
+            <div style="
+                padding-bottom: 1rem;
+                margin-bottom: 1.5rem;
+                border-bottom: 1px solid var(--border-color-light);
+                display: flex;
+                align-items: center;">
+                <div style="
+                    font-weight: 600;
+                    font-size: 1.1rem;
+                    color: var(--primary-text);">
+                    <span style="margin-right: 0.5rem;">🛠️</span>Available Tools
+                </div>
+            </div>
+            {tool_items}
+        </div>
+        """
+        return full_tools_html
+    except Exception as e:
+        return f"""
+        <div style="
+            padding: 1.5rem;
+            color: var(--score-low);
+            background-color: var(--surface-color);
+            border: 1px solid var(--score-low);
+            border-radius: 10px;">
+            <div style="font-weight: 600; margin-bottom: 0.5rem;">Error Formatting Tool Info</div>
+            <div style="font-family: monospace; white-space: pre-wrap;">{str(e)}</div>
+        </div>
+        """

data_loader.py ADDED Viewed

	@@ -0,0 +1,457 @@

+import pandas as pd
+def load_data():
+    """Load and preprocess the data."""
+    df = pd.read_csv("leaderboard.csv").dropna()
+    return df
+df = load_data()
+MODELS = [x.strip() for x in df["Model"].unique().tolist()]
+# Shared <style> block that sets the theme CSS custom properties on :root,
+# with one set of values for dark colour schemes and one for light.
+# HEADER_CONTENT prepends this string to its own markup.
+COMMON = """
+<style>
+    @media (prefers-color-scheme: dark) {
+        :root {
+            --bg-primary: #0B0B19;
+            --bg-secondary: rgba(19, 19, 37, 0.4);
+            --bg-hover: rgba(30, 30, 45, 0.95);
+            --text-primary: #ffffff;
+            --text-secondary: #e2e8f0;
+            --text-tertiary: #e2e8f0;
+            --card-bg: rgba(17, 17, 27, 0.4);
+            --border-color: rgba(31, 41, 55, 0.5);
+            --border-hover: rgba(79, 70, 229, 0.4);
+            --accent-color: #ffffff;
+            --accent-bg: rgba(79, 70, 229, 0.1);
+            --blue-gradient: linear-gradient(45deg, #3B82F6, #A8C4F0);;
+            --orange-gradient: linear-gradient(45deg, #E05205, #FAD8D2);
+            --green-gradient: linear-gradient(45deg, #60cc1c, #a0e65e);
+            --shadow-color: rgba(0, 0, 0, 0.2);
+        }
+    }
+    @media (prefers-color-scheme: light) {
+        :root {
+            --bg-primary: #ffffff;
+            --bg-secondary: rgba(243, 244, 246, 0.4);
+            --bg-hover: rgba(229, 231, 235, 0.95);
+            --text-primary: #1F2937;
+            --text-secondary: #4B5563;
+            --text-tertiary: #6B7280;
+            --card-bg: rgba(249, 250, 251, 0.4);
+            --border-color: rgba(209, 213, 219, 0.5);
+            --border-hover: rgba(79, 70, 229, 0.4);
+            --accent-color: #4F46E5;
+            --accent-bg: rgba(79, 70, 229, 0.1);
+            --blue-gradient: linear-gradient(45deg, #3B82F6, #A8C4F0);;
+            --orange-gradient: linear-gradient(45deg, #E05205, #FF8340);
+            --green-gradient: linear-gradient(45deg, #60cc1c, #a0e65e);
+            --shadow-color: rgba(0, 0, 0, 0.1);
+        }
+    }
+</style>
+"""
+# Define constants for the links
+PAPER_LINK = "https://github.com/jkhouja/L2"
+CODE_LINK = "https://github.com/jkhouja/L2"
+BLOG_LINK = "https://github.com/jkhouja/L2"
+DATASET_LINK = "https://huggingface.co/datasets/jkhouja/LingOly-TOO"
+ADD_MODEL_LINK = (
+    "https://mail.google.com/mail/?view=cm&fs=1&to=jude.khouja@oii.ox.ac.uk"
+    "&su=Get%20Model%20Added%20to%20Leaderboard&body=Hi%20there%2C%0A%0AI%20"
+    "would%20like%20to%20add%20my%20model%20to%20the%20Lingoly-TOO%20Leaderboard.%0A%0AModel%20Name%3A%0AModel%20URL%3A%0A%0ABest%20regards"
+)
+# Landing-page header: the shared COMMON theme variables followed by the
+# header's own CSS and markup (title, description, and the row of action
+# buttons for paper, code, blog, dataset and "Add Your Model").
+# The second part is an f-string so the *_LINK constants can be interpolated;
+# literal CSS braces are therefore written doubled ({{ and }}).
+HEADER_CONTENT = (
+    COMMON
+    + f"""
+<style>
+    .header-wrapper {{
+        position: relative;
+        background: var(--bg-primary);
+        padding: 4rem 2rem;
+        border-radius: 16px;
+        margin-bottom: 0;
+        transition: all 0.3s ease;
+    }}
+    .header-content {{
+        max-width: 72rem;
+        margin: 0 auto;
+    }}
+    .title-section {{
+        position: relative;
+        display: flex;
+        align-items: center;
+        justify-content: center;
+        margin-bottom: 3rem;
+    }}
+    .title-gradient {{
+        font-size: 5rem;
+        font-weight: 800;
+        line-height: 1.1;
+        background: var(--orange-gradient);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+        margin-bottom: 0.5rem;
+    }}
+    .title-image {{
+        position: absolute;
+        top: 30px;
+        left: 30px;
+        width: 100px;
+        height: 100px;
+        /* To make it look ok on dark mode */
+        background-color: #ffffffd0;
+        padding: 10px;
+        border-radius: 6px;
+    }}
+    .subtitle-white {{
+        font-size: 5rem;
+        font-weight: 800;
+        line-height: 1.1;
+        color: var(--text-primary);
+        margin-bottom: 3rem;
+        transition: color 0.3s ease;
+    }}
+    .description {{
+        color: var(--text-secondary);
+        font-size: 1.25rem;
+        line-height: 1.75;
+        max-width: 800px;
+        margin: 0 auto;
+        text-align: center;
+        transition: color 0.3s ease;
+    }}
+    .highlight-question {{
+        background: var(--blue-gradient);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+        display: block;
+        margin-top: 1rem;
+        font-size: 1.5rem;
+        font-weight: 500;
+    }}
+    .metrics-grid {{
+        display: grid;
+        grid-template-columns: repeat(3, 1fr);
+        gap: 1.5rem;
+        margin-top: 4rem;
+    }}
+    .metric-card {{
+        background: var(--bg-secondary);
+        border: 1px solid var(--border-color);
+        text-align: center;
+        border-radius: 1rem;
+        padding: 2rem;
+        transition: all 0.3s ease;
+        align-items: center;
+    }}
+    .metric-card:hover {{
+        transform: translateY(-5px);
+        border-color: var(--border-hover);
+        box-shadow: 0 4px 20px var(--shadow-color);
+    }}
+    .metric-number {{
+        font-size: 4rem;
+        font-weight: 800;
+        margin-bottom: 1rem;
+    }}
+    .metric-blue {{
+        background: var(--blue-gradient);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+    }}
+    .metric-purple {{
+        background: var(--orange-gradient);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+    }}
+    .metric-green {{
+        background: var(--green-gradient);
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+    }}
+    .metric-label {{
+        color: var(--text-secondary);
+        font-size: 1.5rem;
+        margin-bottom: 1.5rem;
+        transition: color 0.3s ease;
+    }}
+    .metric-detail {{
+        font-size: 1.125rem;
+        line-height: 1.75;
+        margin-top: 0.5rem;
+        transition: color 0.3s ease;
+    }}
+    .metric-detail.primary {{
+        color: var(--accent-color);
+    }}
+    .metric-detail.secondary {{
+        color: var(--text-secondary);
+    }}
+    .actions {{
+        display: flex;
+        gap: 1rem;
+        justify-content: center;
+        margin-top: 3rem;
+    }}
+    .action-button {{
+        display: flex;
+        align-items: center;
+        gap: 0.5rem;
+        padding: 0.75rem 1.5rem;
+        background: var(--bg-secondary);
+        border: 1px solid var(--border-color);
+        border-radius: 100px;
+        color: var(--text-primary) !important;
+        text-decoration: none !important;
+        font-size: 0.95rem;
+        transition: all 0.3s ease;
+    }}
+    .action-button:hover {{
+        transform: translateY(-2px);
+        border-color: var(--accent-color);
+        background: var(--accent-bg);
+    }}
+    @media (max-width: 1024px) {{
+        .title-image {{
+            top: 20px;
+            left: 20px;
+            width: 80px;
+            height: 80px;
+        }}
+        .title-gradient, .subtitle-white {{
+            font-size: 3rem;
+        }}
+    }}
+    @media (max-width: 620px) {{
+        .title-image {{
+            position: relative;
+            margin-top: -30px !important;
+            margin-bottom: 20px !important;
+            top: 0;
+            left: 0;
+        }}
+    }}
+</style>
+<div class="header-wrapper">
+    <div class="header-content">
+        <div class="title-section">
+            <div class="title-gradient">LingOly-TOO</div>
+        </div>
+        <div class="description">
+            LingOly-TOO (L2) is a challenging reasoning benchmark designed to minimize the chance of answering by guessing.
+            It is developed by rewriting (obfuscating) parts of questions and answers so that the chance of leakage in training data is minimum.
+            <div class="highlight-question">
+                "How do top LLMs reason on unseen linguistic questions?"
+            </div>
+        </div>
+    </div>
+    <div class="actions">
+        <a href="{PAPER_LINK}" class="action-button">
+            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
+                <line x1="8" y1="12" x2="16" y2="12"/>
+            </svg>
+            Paper
+        </a>
+        <a href="{CODE_LINK}" class="action-button">
+            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
+            </svg>
+            Code
+        </a>
+        <a href="{BLOG_LINK}" class="action-button">
+            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
+                <polyline points="7 10 12 15 17 10"/>
+                <line x1="12" y1="15" x2="12" y2="3"/>
+            </svg>
+            Blog
+        </a>
+        <a href="{DATASET_LINK}" class="action-button">
+            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+                <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
+                <polyline points="7 10 12 15 17 10"/>
+                <line x1="12" y1="15" x2="12" y2="3"/>
+            </svg>
+            Dataset
+        </a>
+        <a href="{ADD_MODEL_LINK}" class="action-button" target="_blank" rel="noopener noreferrer">
+    <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+        <path d="M19 3H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V5a2 2 0 0 0-2-2z"/>
+        <line x1="12" y1="8" x2="12" y2="16"/>
+        <line x1="8" y1="12" x2="16" y2="12"/>
+    </svg>
+    Add Your Model
+</a>
+    </div>
+</div>
+"""
+)
+# Static "metrics-grid" markup with three summary cards (models, problems,
+# questions). The figures are hard-coded in the string rather than computed
+# from the loaded leaderboard data.
+CARDS = """        <div class="metrics-grid">
+            <div class="metric-card">
+                <div class="metric-number metric-blue">11</div>
+                <div class="metric-label">Total Models</div>
+                <div class="metric-detail primary">4 Reasoning Models</div>
+                <div class="metric-detail primary">4 Open Source Models</div>
+            </div>
+            <div class="metric-card">
+                <div class="metric-number metric-purple">82</div>
+                <div class="metric-label">Linguistics Problems</div>
+                <div class="metric-detail primary">6 Permutations per problem</div>
+                <div class="metric-detail primary">Problems from Low-resource Languages</div>
+            </div>
+            <div class="metric-card">
+                <div class="metric-number metric-green">1.2k</div>
+                <div class="metric-label">Total Questions</div>
+                <div class="metric-detail primary">Includes Match-Up, Multiple Choice and Completion</div>
+            </div>
+        </div>"""
+METHODOLOGY = """
+<style>
+    @media (prefers-color-scheme: dark) {
+        :root {
+            --bg-primary: #0B0B19;
+            --bg-secondary: rgba(19, 19, 37, 0.4);
+            --text-primary: #ffffff;
+            --text-secondary: #94A3B8;
+            --border-primary: rgba(31, 41, 55, 0.5);
+            --accent-blue: #60A5FA;
+            --accent-purple: #A78BFA;
+            --card-hover-bg: rgba(79, 70, 229, 0.1);
+            --shadow-color: rgba(79, 70, 229, 0.1);
+        }
+    }
+    @media (prefers-color-scheme: light) {
+        :root {
+            --bg-primary: #ffffff;
+            --bg-secondary: rgba(243, 244, 246, 0.4);
+            --text-primary: #111827;
+            --text-secondary: #4B5563;
+            --border-primary: rgba(209, 213, 219, 0.5);
+            --accent-blue: #3B82F6;
+            --accent-purple: #8B5CF6;
+            --card-hover-bg: rgba(243, 244, 246, 0.8);
+            --shadow-color: rgba(0, 0, 0, 0.1);
+        }
+    }
+    .dataset-table {
+        width: 100%;
+        border-collapse: separate;
+        border-spacing: 0;
+        margin: 2rem 0;
+        background: var(--bg-secondary);
+        border-radius: 1rem;
+        overflow: hidden;
+        box-shadow: 0 4px 20px var(--shadow-color);
+    }
+    .dataset-table thead {
+        background: linear-gradient(90deg, var(--accent-blue), var(--accent-purple));
+    }
+    .dataset-table th {
+        padding: 1.25rem 1rem;
+        text-align: left;
+        color: white;
+        font-weight: 600;
+        font-size: 1rem;
+    }
+    .dataset-table td {
+        padding: 1rem;
+        border-bottom: 1px solid var(--border-primary);
+        color: var(--text-secondary);
+        transition: all 0.2s ease;
+    }
+    .dataset-table tbody tr:hover td {
+        background: var(--card-hover-bg);
+        color: var(--text-primary);
+    }
+    .methodology-content {
+        max-width: 1200px;
+        margin: 0 auto;
+        padding: 2rem;
+        color: var(--text-secondary);
+        line-height: 1.7;
+        font-size: 1rem;
+    }
+    .section-title {
+        font-size: 2.5rem;
+        font-weight: 700;
+        margin: 3rem 0 1.5rem;
+        color: var(--text-primary);
+        background: linear-gradient(to right, var(--accent-blue), var(--accent-purple));
+        -webkit-background-clip: text;
+        -webkit-text-fill-color: transparent;
+        letter-spacing: -0.02em;
+    }
+</style>
+<div class="section-divider"></div>
+    <h1 class="section-title">Citation</h2>
+    <div class="bibtex-citation" style="font-family: monospace; white-space: pre; padding: 1em; background-color: rgba(128, 128, 128, 0.1); border: 1px solid rgba(128, 128, 128, 0.2); border-radius: 4px; color: currentColor;">@article{lingoly-too2025,
+    author = {Khouja, Jude and Korgul, Karolina and Hellsten, Simeon and Yang, Lingyi
+    and Neacșu, Vlad A. and Mayne, Harry and Kearns, Ryan O. and Bean, Andrew M. and Mahdi, Adam},
+    title = {LINGOLY-TOO: Disentangling Memorisation from Reasoning with Linguistic Templatisation and Orthographic Obfuscation},
+    year = {2025},
+    primaryClass={cs.CL},
+    archivePrefix=,
+}</div>
+"""
+# Draft "Key insights" HTML section kept as a module-level string.
+# NOTE(review): the name suggests nothing renders it - confirm no importer
+# references it before deleting.
+UNUSED = """
+<!-- Insights Section -->
+<h1 class="section-title">Key insights</h1>
+<p>
+    We use orthographic templatisation on Linguistics Olympiad problems to create obfuscated variants
+    that maintain the same reasoning steps. Through extensive experiments, we show that obfuscation
+    reduces measurement bias from data exposure and provides reasoning estimates that correlate with
+    the ability to solve linguistic reasoning problems. Additionally, we find that state-of-the-art
+    models exhibit inconsistent reasoning abilities and that simple fine-tuning does not necessarily
+    equip models with context-free and robust problem-solving skills. This work establishes a reasoning
+    measure that is resilient to data exposure effects and supports ongoing efforts to fully understand
+    response generation in advanced models.
+</p>
+"""

leaderboard.csv ADDED Viewed

	@@ -0,0 +1,12 @@

+Model,Provider,Type,Baseline score,Obfuscated score
+Aya 23 35B,Cohere,Open source,0.10654349746757057,0.05708180119638717
+Claude 3.5 Sonnet,Anthropic,Closed source,0.48255271180599657,0.2810140963355337
+Claude 3.7 Sonnet,Anthropic,Closed source,0.5994013309112796,0.4357505520191723
+GPT 4.5,OpenAI,Closed source,0.4208265195574057,0.2545024812218498
+GPT 4o,OpenAI,Closed source,0.31371291749661456,0.1563339989919302
+Gemini 1.5 Pro,Google,Closed source,0.3690345167304693,0.20461522579355207
+Llama 3.3 70B-Instruct,Meta,Open source,0.11452795751175084,0.08213118755937426
+Phi4,Microsoft,Open source,0.1809802769595679,0.10996628714372364
+DeepSeek R1,DeepSeek,Open source,0.3965527162895584,0.2649618642615188
+o1-preview,OpenAI,Closed source,0.47730527712315257,0.3222020975619888
+o3-mini,OpenAI,Closed source,0.42172257807447155,0.3059086523804619

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio==5.18.0
+pandas
+matplotlib
+plotly

results.csv ADDED Viewed

	@@ -0,0 +1,22 @@

+Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
+claude-3-7-sonnet-20250219,Private,Reasoning,Anthropic,3,15,0.953,0.96,0.95,0.92,0.96,1,0.95,0.97,1,0.96,0.94,0.97,0.96,0.99,0.82,0.92,0.975
+gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
+gemini-2.0-flash-lite-001,Private,Normal,Google,0.075,0.3,0.933,0.96,0.91,0.81,0.98,0.98,0.9,0.91,0.92,0.98,0.86,0.99,0.87,0.97,0.96,0.95,0.975
+gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
+gpt-4.5-preview-2025-02-27,Private,Normal,OpenAI,75,150,0.900,0.93,0.87,0.85,0.91,0.92,0.97,0.92,0.99,0.67,0.85,0.98,0.85,1,0.98,0.8,0.915
+gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
+gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
+o1-2024-12-17,Private,Reasoning,OpenAI,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
+o3-mini-2025-01-31,Private,Reasoning,OpenAI,1.1,4.4,0.847,0.80,0.90,0.87,0.91,0.84,0.72,0.93,0.98,0.63,0.85,0.97,0.84,1,0.43,0.91,0.975
+mistral-small-2501,Open source,Normal,Mistral,0.1,0.3,0.832,0.88,0.78,0.83,0.78,0.92,0.97,0.76,0.99,0.62,0.8,0.82,0.77,0.95,0.92,0.74,0.775
+gpt-4o-mini,Private,Normal,OpenAI,0.15,0.6,0.832,0.85,0.82,0.82,0.85,0.51,0.98,0.83,1,0.54,0.83,0.94,0.83,0.96,0.99,0.73,0.835
+qwen2.5-72b-instruct,Open source,Normal,Alibaba,0.9,0.9,0.817,0.80,0.84,0.84,0.87,0.92,0.63,0.86,0.99,0.66,0.79,0.99,0.77,0.97,0.42,0.78,0.95
+mistral-large-2411,Private,Normal,Mistral,2,6,0.810,0.87,0.75,0.77,0.76,0.83,0.93,0.75,0.97,0.65,0.77,0.87,0.78,0.9,0.94,0.7,0.725
+claude-3-5-sonnet-20241022,Private,Normal,Anthropic,3,15,0.801,0.83,0.77,0.68,0.81,0.68,0.78,0.85,0.91,0.92,0.67,0.9,0.75,0.74,0.88,0.69,0.955
+Llama-3.3-70B-Instruct-Turbo,Open source,Normal,Meta,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865
+claude-3-5-haiku-20241022,Private,Normal,Anthropic,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905
+mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,0.98,0.7,1,0.42,0.77,0.84,0.78,0.93,0.85,0.62,0.425
+ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
+Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
+open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
+Dataset Avg,,,,,,,0.84,0.81,0.82,0.81,0.79,0.89,0.82,0.96,0.64,0.82,0.84,0.83,0.93,0.86,0.76,0.82

tabs/leaderboard.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import gradio as gr
+from data_loader import METHODOLOGY
+from utils import (
+    get_rank_badge,
+    get_score_bar,
+    get_type_badge,
+)
+def filter_leaderboard(df, sort_by):
+    filtered_df = df.copy()
+    if sort_by == "Score after obfuscation":
+        filtered_df = filtered_df.sort_values(by="Obfuscated score", ascending=False)
+    else:
+        filtered_df = filtered_df.sort_values(by="Baseline score", ascending=False)
+    filtered_df["Rank"] = range(1, len(filtered_df) + 1)
+    # Generate styled table HTML
+    table_html = f"""
+    <style>
+        @media (prefers-color-scheme: dark) {{
+            :root {{
+                --bg-color: #1a1b1e;
+                --text-color: #ffffff;
+                --border-color: #2d2e32;
+                --hover-bg: #2d2e32;
+                --note-bg: #2d2e32;
+                --note-text: #a1a1aa;
+                --accent-blue: #60A5FA;
+                --accent-purple: #A78BFA;
+                --accent-pink: #F472B6;
+                --score-bg: rgba(255, 255, 255, 0.1);
+            }}
+        }}
+        @media (prefers-color-scheme: light) {{
+            :root {{
+                --bg-color: #ffffff;
+                --text-color: #000000;
+                --border-color: #e5e7eb;
+                --hover-bg: #f3f4f6;
+                --note-bg: #f3f4f6;
+                --note-text: #4b5563;
+                --accent-blue: #3B82F6;
+                --accent-purple: #8B5CF6;
+                --accent-pink: #EC4899;
+                --score-bg: rgba(0, 0, 0, 0.1);
+            }}
+        }}
+        .dark-table-container {{
+            background: var(--bg-color);
+            border-radius: 12px;
+            padding: 1px;
+            margin: 20px 0;
+        }}
+        .dark-styled-table {{
+            width: 100%;
+            border-collapse: collapse;
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
+            background: var(--bg-color);
+            color: var(--text-color);
+        }}
+        .dark-styled-table thead {{
+            position: sticky;
+            top: 0;
+            background: var(--bg-color);
+            z-index: 1;
+        }}
+        .dark-styled-table th {{
+            padding: 16px;
+            text-align: left;
+            font-weight: 500;
+            color: var(--text-color);
+            border-bottom: 1px solid var(--border-color);
+        }}
+        .dark-styled-table td {{
+            padding: 16px;
+            border-bottom: 1px solid var(--border-color);
+            color: var(--text-color);
+        }}
+        .dark-styled-table tbody tr:hover {{
+            background: var(--hover-bg);
+        }}
+        .model-cell {{
+            font-weight: 500;
+        }}
+        .score-cell {{
+            font-weight: 500;
+        }}
+        .note-box {{
+            margin-top: 20px;
+            padding: 16px;
+            background: var(--note-bg);
+            border-radius: 8px;
+            color: var(--note-text);
+        }}
+    </style>
+    <div class="dark-table-container">
+        <table class="dark-styled-table">
+            <thead>
+                <tr>
+                    <th>Rank</th>
+                    <th>Model</th>
+                    <th>Provider</th>
+                    <th>Type</th>
+                    <th>Exact match score (obfuscated questions)</th>
+                    <th>Exact match score (all questions)</th>
+                </tr>
+            </thead>
+            <tbody>
+    """
+    for _, row in filtered_df.iterrows():
+        table_html += f"""
+            <tr>
+                <td>{get_rank_badge(row['Rank'])}</td>
+                <td class="model-cell">{row['Model']}</td>
+                <td class="vendor-cell">{row['Provider']}</td>
+                <td>{get_type_badge(row['Type'])}</td>
+                <td class="score-cell">{get_score_bar(row['Obfuscated score'])}</td>
+                <td class="score-cell">{get_score_bar(row['Baseline score'])}</td>
+            </tr>
+        """
+    return table_html
+def create_leaderboard_tab(df, HEADER_CONTENT, CARDS):
+    gr.HTML(HEADER_CONTENT + CARDS)
+    # Filters row
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=0.4):
+            sort_by = gr.Dropdown(
+                choices=["Score after obfuscation", "Score on all"],
+                value="Score after obfuscation",
+                label="Sort by",
+            )
+    # Content
+    output = gr.HTML()
+    sort_by.change(
+        fn=lambda s: filter_leaderboard(df, s),
+        inputs=[sort_by],
+        outputs=[output],
+    )
+    return output

utils.py ADDED Viewed

	@@ -0,0 +1,107 @@

+def get_rank_badge(rank):
+    """Generate HTML for rank badge with appropriate styling"""
+    badge_styles = {
+        1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
+        2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
+        3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
+    }
+    if rank in badge_styles:
+        label, gradient, text_color = badge_styles[rank]
+        return f"""
+            <div style="
+                display: inline-flex;
+                align-items: center;
+                justify-content: center;
+                min-width: 48px;
+                padding: 4px 12px;
+                background: {gradient};
+                color: {text_color};
+                border-radius: 6px;
+                font-weight: 600;
+                font-size: 0.9em;
+                box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
+            ">
+                {label}
+            </div>
+        """
+    return f"""
+        <div style="
+            display: inline-flex;
+            align-items: center;
+            justify-content: center;
+            min-width: 28px;
+            color: #a1a1aa;
+            font-weight: 500;
+        ">
+            {rank}
+        </div>
+    """
+def get_score_bar(score):
+    """Generate HTML for score bar with gradient styling"""
+    width = score * 100
+    return f"""
+        <div style="display: flex; align-items: center; gap: 12px; width: 100%;">
+            <div style="
+                flex-grow: 1;
+                height: 8px;
+                background: var(--score-bg, rgba(255, 255, 255, 0.1));
+                border-radius: 4px;
+                overflow: hidden;
+                max-width: 200px;
+            ">
+                <div style="
+                    width: {width}%;
+                    height: 100%;
+                    background: linear-gradient(90deg, var(--accent-blue, #60A5FA), var(--accent-orange, #E05205));
+                    border-radius: 4px;
+                    transition: width 0.3s ease;
+                "></div>
+            </div>
+            <span style="
+                font-family: 'SF Mono', monospace;
+                font-weight: 600;
+                color: var(--text-primary, #ffffff);
+                min-width: 60px;
+            ">{width:.1f}</span>
+        </div>
+    """
+def get_chart_colors():
+    # if is_dark_theme():
+    #     return {
+    #         "Private": "#60A5FA",  # accent-blue
+    #         "Open source": "#A78BFA",  # accent-purple
+    #         "performance_bands": ["#DCFCE7", "#FEF9C3", "#FEE2E2"],
+    #         "text": "#FFFFFF",
+    #         "background": "#1a1b1e",
+    #         "grid": (1, 1, 1, 0.1),  # RGBA tuple for grid
+    #     }
+    return {
+        "Private": "#3B82F6",  # accent-blue light
+        "Open source": "#60CC1C",  # accent-purple light
+        "performance_bands": ["#DCFCE7", "#FEF9C3", "#FEE2E2"],
+        "text": "#111827",
+        "background": "#FFFFFF",
+        "grid": (0, 0, 0, 0.1),  # RGBA tuple for grid
+    }
+def get_type_badge(model_type):
+    """Generate HTML for model type badge"""
+    colors = get_chart_colors()
+    colors = {"Closed source": colors["Private"], "Open source": colors["Open source"]}
+    bg_color = colors.get(model_type, "#4F46E5")
+    return f"""
+        <div style="
+            display: inline-flex;
+            align-items: center;
+            padding: 4px 8px;
+            background: {bg_color};
+            color: white;
+            border-radius: 4px;
+            font-size: 0.85em;
+            font-weight: 500;
+        ">
+            {model_type}
+        </div>
+    """

visualization.py ADDED Viewed

	@@ -0,0 +1,247 @@

+from utils import get_chart_colors
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import plotly.graph_objects as go
+def setup_matplotlib():
+    """Select matplotlib's "Agg" backend and close every open pyplot figure."""
+    matplotlib.use("Agg")
+    plt.close("all")
+def get_performance_chart(df, category_name="Overall"):
+    plt.close("all")
+    colors = get_chart_colors()
+    score_column = "Category Score"
+    df_sorted = df.sort_values(score_column, ascending=True)
+    height = max(8, len(df_sorted) * 0.8)
+    fig, ax = plt.subplots(figsize=(16, height))
+    plt.rcParams.update({"font.size": 12})
+    fig.patch.set_facecolor(colors["background"])
+    ax.set_facecolor(colors["background"])
+    try:
+        bars = ax.barh(
+            np.arange(len(df_sorted)),
+            df_sorted[score_column],
+            height=0.4,
+            capstyle="round",
+            color=[colors[t] for t in df_sorted["Model Type"]],
+        )
+        ax.set_title(
+            f"Model Performance - {category_name}",
+            pad=20,
+            fontsize=20,
+            fontweight="bold",
+            color=colors["text"],
+        )
+        ax.set_xlabel(
+            "Average Score (Tool Selection Quality)",
+            fontsize=14,
+            labelpad=10,
+            color=colors["text"],
+        )
+        ax.set_xlim(0.0, 1.0)
+        ax.set_yticks(np.arange(len(df_sorted)))
+        ax.set_yticklabels(df_sorted["Model"], fontsize=12, color=colors["text"])
+        plt.subplots_adjust(left=0.35)
+        for i, v in enumerate(df_sorted[score_column]):
+            ax.text(
+                v + 0.01,
+                i,
+                f"{v:.3f}",
+                va="center",
+                fontsize=12,
+                fontweight="bold",
+                color=colors["text"],
+            )
+        ax.grid(True, axis="x", linestyle="--", alpha=0.2, color=colors["grid"])
+        ax.spines[["top", "right"]].set_visible(False)
+        ax.spines[["bottom", "left"]].set_color(colors["grid"])
+        ax.tick_params(colors=colors["text"])
+        legend_elements = [
+            plt.Rectangle((0, 0), 1, 1, facecolor=color, label=label)
+            for label, color in {
+                k: colors[k] for k in ["Private", "Open source"]
+            }.items()
+        ]
+        ax.legend(
+            handles=legend_elements,
+            title="Model Type",
+            loc="lower right",
+            fontsize=12,
+            title_fontsize=14,
+            facecolor=colors["background"],
+            labelcolor=colors["text"],
+        )
+        plt.tight_layout()
+        return fig
+    finally:
+        plt.close(fig)
+def create_radar_plot(df, model_names):
+    datasets = [col for col in df.columns[7:] if col != "IO Cost"]
+    fig = go.Figure()
+    colors = ["rgba(99, 102, 241, 0.3)", "rgba(34, 197, 94, 0.3)"]
+    line_colors = ["#4F46E5", "#16A34A"]
+    for idx, model_name in enumerate(model_names):
+        model_data = df[df["Model"] == model_name].iloc[0]
+        values = [model_data[m] for m in datasets]
+        values.append(values[0])
+        datasets_plot = datasets + [datasets[0]]
+        fig.add_trace(
+            go.Scatterpolar(
+                r=values,
+                theta=datasets_plot,
+                fill="toself",
+                fillcolor=colors[idx % len(colors)],
+                line=dict(color=line_colors[idx % len(line_colors)], width=2),
+                name=model_name,
+                text=[f"{val:.3f}" for val in values],
+                textposition="middle right",
+                mode="lines+markers+text",
+            )
+        )
+    fig.update_layout(
+        polar=dict(
+            radialaxis=dict(
+                visible=True, range=[0, 1], showline=False, tickfont=dict(size=12)
+            ),
+            angularaxis=dict(
+                tickfont=dict(size=13, family="Arial"),
+                rotation=90,
+                direction="clockwise",
+            ),
+        ),
+        showlegend=True,
+        legend=dict(
+            orientation="h",
+            yanchor="bottom",
+            y=-0.2,
+            xanchor="center",
+            x=0.5,
+            font=dict(size=14),
+        ),
+        title=dict(
+            text="Model Comparison",
+            x=0.5,
+            y=0.95,
+            font=dict(size=24, family="Arial", color="#1F2937"),
+        ),
+        paper_bgcolor="white",
+        plot_bgcolor="white",
+        height=700,
+        width=900,
+        margin=dict(t=100, b=100, l=80, r=80),
+    )
+    return fig
+def get_performance_cost_chart(df, category_name="Overall"):
+    colors = get_chart_colors()
+    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
+    fig.patch.set_facecolor(colors["background"])
+    ax.set_facecolor(colors["background"])
+    ax.grid(True, linestyle="--", alpha=0.15, which="both", color=colors["grid"])
+    score_column = "Category Score"
+    for _, row in df.iterrows():
+        color = colors[row["Model Type"]]
+        size = 100 if row[score_column] > 0.85 else 80
+        edge_color = (
+            colors["Private"]
+            if row["Model Type"] == "Private"
+            else colors["Open source"]
+        )
+        ax.scatter(
+            row["IO Cost"],
+            row[score_column] * 100,
+            c=color,
+            s=size,
+            alpha=0.9,
+            edgecolor=edge_color,
+            linewidth=1,
+            zorder=5,
+        )
+        bbox_props = dict(
+            boxstyle="round,pad=0.3", fc=colors["background"], ec="none", alpha=0.8
+        )
+        ax.annotate(
+            f"{row['Model']}\n(${row['IO Cost']:.2f})",
+            (row["IO Cost"], row[score_column] * 100),
+            xytext=(5, 5),
+            textcoords="offset points",
+            fontsize=8,
+            color=colors["text"],
+            bbox=bbox_props,
+            zorder=6,
+        )
+    ax.set_xscale("log")
+    ax.set_xlim(0.08, 40)
+    ax.set_ylim(60, 100)
+    ax.set_xlabel(
+        "I/O Cost per Million Tokens ($)",
+        fontsize=10,
+        labelpad=10,
+        color=colors["text"],
+    )
+    ax.set_ylabel(
+        "Model Performance Score", fontsize=10, labelpad=10, color=colors["text"]
+    )
+    legend_elements = [
+        plt.scatter([], [], c=colors[label], label=label, s=80)
+        for label in ["Private", "Open source"]
+    ]
+    ax.legend(
+        handles=legend_elements,
+        loc="upper right",
+        frameon=True,
+        facecolor=colors["background"],
+        edgecolor="none",
+        fontsize=9,
+        labelcolor=colors["text"],
+    )
+    ax.set_title(
+        f"Performance vs. Cost - {category_name}",
+        fontsize=14,
+        pad=15,
+        fontweight="bold",
+        color=colors["text"],
+    )
+    for y1, y2, color in zip([85, 75, 60], [100, 85, 75], colors["performance_bands"]):
+        ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)
+    ax.tick_params(axis="both", which="major", labelsize=9, colors=colors["text"])
+    ax.tick_params(axis="both", which="minor", labelsize=8, colors=colors["text"])
+    ax.xaxis.set_minor_locator(plt.LogLocator(base=10.0, subs=np.arange(2, 10) * 0.1))
+    for spine in ax.spines.values():
+        spine.set_color(colors["grid"])
+    plt.tight_layout()
+    return fig