SLM-RAG-Arena

Running on Zero

App Files Files Community

Haoguang Cai committed on May 3

Commit

8a142a6

1 Parent(s): e2dea9b

add UI and data processing

Browse files

Files changed (10) hide show

app.py +354 -88
static/.DS_Store +0 -0
static/styles.css +640 -0
utils/__init__.py +2 -0
utils/arena_df.csv +0 -0
utils/context_processor.py +206 -0
utils/data_loader.py +162 -0
utils/leaderboard.py +76 -0
utils/models.py +41 -0
utils/ui_helpers.py +33 -0

app.py CHANGED Viewed

@@ -1,102 +1,368 @@
 import gradio as gr
 import random
-import json
 import os
-from datetime import datetime
-# This would be replaced with your actual SLM integration
-def generate_response(query, context, model_name):
-    """Placeholder function to generate response from an SLM"""
-    return f"This is a placeholder response from {model_name} based on query: {query} and context: {context}"
-def save_evaluation(query, context, model_a, model_b, response_a, response_b, preference):
-    """Save evaluation results to a JSON file"""
-    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    evaluation = {
-        "timestamp": timestamp,
-        "query": query,
-        "context": context,
-        "models": {
-            "left": model_a,
-            "right": model_b
-        },
-        "responses": {
-            "left": response_a,
-            "right": response_b
-        },
-        "preference": preference
-    }
-    # Create directory if it doesn't exist
-    os.makedirs("evaluations", exist_ok=True)
-    # Save to a file
-    with open(f"evaluations/eval_{timestamp.replace(' ', '_').replace(':', '-')}.json", "w") as f:
-        json.dump(evaluation, f, indent=2)
-    return "Evaluation saved successfully!"
-def process_query(query, context, model_a="SLM-A", model_b="SLM-B"):
-    """Process query and generate responses from two models"""
-    # Generate responses
-    response_a = generate_response(query, context, model_a)
-    response_b = generate_response(query, context, model_b)
-    # Randomly swap to avoid position bias
-    if random.random() > 0.5:
-        return response_a, response_b, model_a, model_b
-    else:
-        return response_b, response_a, model_b, model_a
-def submit_evaluation(query, context, response_left, response_right, preference, model_left, model_right):
-    """Submit and save the evaluation"""
-    if not preference:
-        return "Please select a preference before submitting."
-    save_evaluation(query, context, model_left, model_right, response_left, response_right, preference)
-    return "Thank you for your evaluation!"
-with gr.Blocks(title="SLM-RAG Arena") as app:
-    gr.Markdown("# SLM-RAG Arena")
-    gr.Markdown("Compare responses from different models for RAG tasks.")
-    with gr.Row():
-        with gr.Column():
-            query_input = gr.Textbox(label="Query", placeholder="Enter your query here...")
-            context_input = gr.Textbox(label="Context", placeholder="Enter context information here...", lines=5)
-            generate_btn = gr.Button("Generate Responses")
-    # Hidden state variables
-    model_left = gr.State("")
-    model_right = gr.State("")
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("### Response A")
-            response_left = gr.Textbox(label="", lines=10, interactive=False)
-        with gr.Column():
-            gr.Markdown("### Response B")
-            response_right = gr.Textbox(label="", lines=10, interactive=False)
-    with gr.Row():
-        preference = gr.Radio(
-            choices=["Prefer Left", "Tie", "Prefer Right", "Neither"],
-            label="Which response do you prefer?"
-        )
-    submit_btn = gr.Button("Submit Evaluation")
-    result = gr.Textbox(label="Result")
-    generate_btn.click(
-        process_query,
-        inputs=[query_input, context_input],
-        outputs=[response_left, response_right, model_left, model_right]
     )
-    submit_btn.click(
-        submit_evaluation,
-        inputs=[query_input, context_input, response_left, response_right, preference, model_left, model_right],
-        outputs=[result]
     )
-app.launch()

 import gradio as gr
 import random
+import pandas as pd
 import os
+from utils.data_loader import get_random_example
+from utils.models import generate_summaries, model_names
+from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
+from utils.leaderboard import load_leaderboard_data, save_leaderboard_data
+# Read CSS from file.
+# The stylesheet is resolved relative to this module rather than the current
+# working directory, so the app also starts when launched from another folder.
+css_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static', 'styles.css')
+# Read the stylesheet as UTF-8 explicitly instead of the platform default.
+with open(css_path, 'r', encoding='utf-8') as f:
+    css_content = f.read()
+# Feedback options: checkbox choices offered after a vote, keyed by the vote
+# value that select_vote_improved looks up ('left', 'right', 'tie', 'neither').
+feedback_options = {
+    "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"],
+    "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
+    "tie": ["Both complete", "Both accurate", "Both well written", "Both handle refusal well (if applicable)"],
+    "neither": ["Both incomplete", "Both hallucinate", "Both irrelevant", "Both incorrectly refuse (if applicable)", "A is bad", "B is bad"]
+}
+def load_new_question_improved(agg_results=None, show_full=False):
+    """Draw a new random example and reset the arena for a fresh vote.
+
+    Args:
+        agg_results: Aggregated results to carry through unchanged; when
+            None they are obtained from load_leaderboard_data().
+        show_full: Not consulted. The context is always rendered with
+            show_full=False and False is returned for that state slot.
+
+    Returns:
+        A list of 26 values: ten state values followed by sixteen component
+        updates, in the order of the `outputs` lists this handler is wired to.
+    """
+    results = load_leaderboard_data() if agg_results is None else agg_results
+    example = get_random_example()
+    # random.sample picks two different entries of model_names.
+    m_a_name, m_b_name = random.sample(model_names, 2)
+    s_a, s_b = generate_summaries(example, m_a_name, m_b_name)
+    topic = example.get('processed_context_desc', '')
+    # Only wrap the topic in markup when there is one; an empty value is
+    # passed through as-is and hides the description component below.
+    context_desc = (
+        f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {topic}</div>"
+        if topic else topic
+    )
+    # Every new question starts with the collapsed context view.
+    collapsed = False
+    context_html = get_context_html(example, show_full=collapsed)
+    state_values = [
+        example,  # current_example
+        m_a_name,  # model_a_name
+        m_b_name,  # model_b_name
+        s_a,  # summary_a_text
+        s_b,  # summary_b_text
+        None,  # selected_winner
+        [],  # feedback_list
+        False,  # show_results_state
+        results,  # results_agg
+        collapsed,  # show_full_context
+    ]
+    display_updates = [
+        gr.update(value=example['question']),  # query_display
+        gr.update(value=context_desc, visible=bool(context_desc)),  # context_description
+        gr.update(value=context_html),  # context_display
+        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),  # context_toggle_btn
+        gr.update(value=s_a),  # summary_a_display
+        gr.update(value=s_b),  # summary_b_display
+    ]
+    # vote_button_a, vote_button_b, vote_button_tie share the same reset.
+    vote_updates = [gr.update(interactive=True, elem_classes=["vote-button"]) for _ in range(3)]
+    vote_updates.append(
+        gr.update(interactive=True, elem_classes=["vote-button", "vote-button-neither"])  # vote_button_neither
+    )
+    control_updates = [
+        gr.update(choices=[], value=[], interactive=False, visible=False),  # feedback_checkboxes
+        gr.update(visible=False),  # feedback_section
+        gr.update(interactive=False, visible=True),  # submit_button
+        gr.update(visible=False),  # results_reveal_area
+        gr.update(interactive=True),  # random_question_btn
+        gr.update(elem_classes=[]),  # main_interface_area
+    ]
+    return state_values + display_updates + vote_updates + control_updates
+def select_vote_improved(winner_choice):
+    """Mark the pressed vote button and open the matching feedback choices.
+
+    Args:
+        winner_choice: The vote value; 'left', 'right', 'tie' and 'neither'
+            are recognised. Any other value highlights no button and yields
+            an empty feedback choice list.
+
+    Returns:
+        A list of eight values: the selected winner followed by updates for
+        feedback_checkboxes, feedback_section, submit_button and the four
+        vote buttons (A, B, tie, neither).
+    """
+    # CSS classes per vote value; the chosen entry additionally gets "selected".
+    classes_by_vote = {
+        'left': ["vote-button"],
+        'right': ["vote-button"],
+        'tie': ["vote-button"],
+        'neither': ["vote-button", "vote-button-neither"],
+    }
+    if winner_choice in classes_by_vote:
+        classes_by_vote[winner_choice].append("selected")
+    choices = feedback_options.get(winner_choice, [])
+    result = [
+        winner_choice,  # selected_winner
+        gr.update(choices=choices, value=[], interactive=True, visible=True),  # feedback_checkboxes
+        gr.update(visible=True),  # feedback_section
+        gr.update(interactive=True),  # submit_button
+    ]
+    # Button order must match the outputs wiring: A, B, tie, neither.
+    for vote in ('left', 'right', 'tie', 'neither'):
+        result.append(gr.update(elem_classes=classes_by_vote[vote]))
+    return result
+def submit_vote_fixed(m_a, m_b, winner, feedback, current_results):
+    """Record one vote, persist the leaderboard and reveal the results.
+
+    Args:
+        m_a: Name of the model behind "Model A".
+        m_b: Name of the model behind "Model B".
+        winner: 'left', 'right', 'tie' or 'neither'; None means no vote
+            has been selected.
+        feedback: Selected feedback labels. Accepted but not used by this
+            function.
+        current_results: Aggregated results with "wins", "losses" and "ties"
+            mappings (model name -> count) and a "votes" total. It is not
+            modified; an updated deep copy is returned instead.
+
+    Returns:
+        A list of 16 values in the order of the submit button's `outputs`:
+        show_results_state, results_agg, then fourteen component updates.
+    """
+    import copy  # local import keeps this block independent of the file's import list
+    if winner is None:
+        print("Warning: Submit called without a winner selected.")
+        # Return one value per wired output (2 states + 14 components) that
+        # leaves everything as it is, instead of a bare empty dict.
+        return [False, current_results] + [gr.update() for _ in range(14)]
+    # Deep copy: a shallow .copy() shares the nested per-model dicts, so the
+    # increments below would also change the caller's `current_results`.
+    updated_results = copy.deepcopy(current_results)
+    for table in ("wins", "losses", "ties"):
+        counts = updated_results.setdefault(table, {})
+        for model in (m_a, m_b):
+            # setdefault never overwrites, so an existing count survives even
+            # when the model is missing from one of the other tables.
+            counts.setdefault(model, 0)
+    if winner == 'left':
+        updated_results["wins"][m_a] += 1
+        updated_results["losses"][m_b] += 1
+    elif winner == 'right':
+        updated_results["wins"][m_b] += 1
+        updated_results["losses"][m_a] += 1
+    elif winner == 'tie':
+        updated_results["ties"][m_a] += 1
+        updated_results["ties"][m_b] += 1
+    # A 'neither' vote changes no per-model count; it only raises the total.
+    updated_results["votes"] = updated_results.get("votes", 0) + 1
+    save_leaderboard_data(updated_results)
+    # Prepare Results Table
+    all_models = set(updated_results["wins"]) | set(updated_results["losses"]) | set(updated_results["ties"])
+    ranked = []
+    for model in sorted(all_models):
+        wins = updated_results["wins"].get(model, 0)
+        losses = updated_results["losses"].get(model, 0)
+        ties = updated_results["ties"].get(model, 0)
+        total_comparisons = wins + losses + ties
+        # A tie counts as half a win; models without comparisons score 0.
+        win_rate = (wins + 0.5 * ties) / total_comparisons if total_comparisons > 0 else 0.0
+        ranked.append((win_rate, {
+            "Model": model,
+            "Win Rate (%)": f"{win_rate:.1%}",
+            "Wins": wins,
+            "Losses": losses,
+            "Ties": ties,
+            "Comparisons": total_comparisons
+        }))
+    # Rank on the exact numeric rate rather than the rounded display string.
+    # list.sort is stable, so models with equal rates stay in name order.
+    ranked.sort(key=lambda pair: pair[0], reverse=True)
+    results_df = pd.DataFrame([row for _, row in ranked])
+    return [
+        True,  # show_results_state
+        updated_results,  # results_agg
+        gr.update(interactive=False),  # vote_button_a
+        gr.update(interactive=False),  # vote_button_b
+        gr.update(interactive=False),  # vote_button_tie
+        gr.update(interactive=False),  # vote_button_neither
+        gr.update(interactive=False),  # feedback_checkboxes
+        gr.update(visible=True),  # feedback_section
+        gr.update(visible=False),  # submit_button
+        gr.update(visible=True),  # results_reveal_area
+        gr.update(interactive=False),  # random_question_btn
+        gr.update(value=results_df, visible=True),  # results_table_display
+        gr.update(elem_classes=["results-revealed"]),  # main_interface_area
+        gr.update(interactive=True),  # context_toggle_btn
+        gr.update(value=m_a),  # model_a_reveal
+        gr.update(value=m_b)  # model_b_reveal
+    ]
+# Create embedded CSS
+css_html = f"<style>{css_content}</style>"
+# Create Gradio interface
+with gr.Blocks(theme=gr.themes.Default(
+    primary_hue=gr.themes.colors.orange,
+    secondary_hue=gr.themes.colors.slate
+)) as demo:
+    # Embed CSS directly in HTML
+    gr.HTML(css_html)
+    # State Variables
+    current_example = gr.State({})
+    model_a_name = gr.State("")
+    model_b_name = gr.State("")
+    summary_a_text = gr.State("")
+    summary_b_text = gr.State("")
+    selected_winner = gr.State(None)
+    feedback_list = gr.State([])
+    show_results_state = gr.State(False)
+    results_agg = gr.State({"wins": {}, "losses": {}, "ties": {}, "votes": 0})
+    show_full_context = gr.State(False)
+    # Create Tabs
+    with gr.Tabs() as tabs:
+        # Main Arena Tab
+        with gr.TabItem("Arena", id="arena-tab"):
+            # Main title and description
+            gr.Markdown("# RAG Summarizer Arena")
+            gr.Markdown("Compare summaries generated by different models based on the provided context and query. Select the better summary, or choose 'Tie' or 'Neither'. Your feedback helps evaluate model performance.")
+            # Main container
+            with gr.Column(elem_id="main-interface-area") as main_interface_area:
+                # Query section
+                with gr.Row(elem_id="query-title-row"):
+                    gr.Markdown("### Query", elem_classes="section-heading")
+                with gr.Row(elem_id="query-container"):
+                    with gr.Row(elem_classes="query-box-row"):
+                        query_display = gr.Markdown(value="Loading question...", elem_classes="query-text")
+                    random_question_btn = gr.Button("🔄 Get Random Question", elem_classes="query-button")
+                # Context description
+                context_description = gr.Markdown("", elem_classes="context-description")
+                # Context section
+                with gr.Row(elem_id="context-header-row"):
+                    gr.Markdown("### Context Provided", elem_classes="context-title")
+                    context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
+                context_display = gr.HTML(value="Loading context...", label="Context Chunks")
+                gr.Markdown("---")
+                gr.Markdown("### Compare Summaries", elem_classes="section-heading")
+                # Model summaries
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
+                            summary_a_display = gr.Textbox(label="Model A", lines=10, interactive=False, show_copy_button=True)
+                    with gr.Column(scale=1):
+                        with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
+                            summary_b_display = gr.Textbox(label="Model B", lines=10, interactive=False, show_copy_button=True)
+                # Voting section
+                gr.Markdown("### Cast Your Vote", elem_classes="section-heading")
+                with gr.Row():
+                    vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"])
+                    vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"])
+                    vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"])
+                    vote_button_neither = gr.Button("❌ Neither is Adequate", elem_classes=["vote-button", "vote-button-neither"])
+                # Feedback section
+                with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
+                    feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
+                # Submit button
+                submit_button = gr.Button("Submit Vote", variant="primary", interactive=False, elem_id="submit-button")
+                # Results area
+                with gr.Column(visible=False) as results_reveal_area:
+                    gr.Markdown("---")
+                    gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")
+                    # Model reveal section
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            gr.Markdown("### Model A was actually:", elem_classes="section-heading")
+                            model_a_reveal = gr.Markdown("", elem_classes="model-reveal model-a-reveal")
+                        with gr.Column(scale=1):
+                            gr.Markdown("### Model B was actually:", elem_classes="section-heading")
+                            model_b_reveal = gr.Markdown("", elem_classes="model-reveal model-b-reveal")
+                    gr.HTML("<div style='height: 10px;'></div>")
+                    # Try another button
+                    with gr.Row(elem_classes=["control-buttons"]):
+                        try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")
+        # Leaderboard Tab
+        with gr.TabItem("Leaderboard", id="leaderboard-tab"):
+            gr.Markdown("# Model Performance Leaderboard")
+            gr.Markdown("View aggregate performance statistics for all models. The table below shows win rates, wins, losses, and ties for each model based on all evaluations.")
+            results_table_display = gr.DataFrame(label="Model Performance", interactive=False, wrap=True)
+    # Event Listeners
+    context_toggle_btn.click(
+        fn=toggle_context_display,
+        inputs=[current_example, show_full_context],
+        outputs=[show_full_context, context_display, context_toggle_btn]
     )
+    demo.load(
+        fn=load_new_question_improved,
+        inputs=[],
+        outputs=[
+            current_example, model_a_name, model_b_name, summary_a_text, summary_b_text,
+            selected_winner, feedback_list, show_results_state, results_agg, show_full_context,
+            query_display, context_description, context_display, context_toggle_btn,
+            summary_a_display, summary_b_display,
+            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
+            feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn,
+            main_interface_area
+        ]
+    )
+    random_question_btn.click(
+        fn=load_new_question_improved,
+        inputs=[],
+        outputs=[
+            current_example, model_a_name, model_b_name, summary_a_text, summary_b_text,
+            selected_winner, feedback_list, show_results_state, results_agg, show_full_context,
+            query_display, context_description, context_display, context_toggle_btn,
+            summary_a_display, summary_b_display,
+            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
+            feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn,
+            main_interface_area
+        ]
+    )
+    vote_button_a.click(
+        fn=lambda: select_vote_improved('left'),
+        inputs=None,
+        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
+    )
+    vote_button_b.click(
+        fn=lambda: select_vote_improved('right'),
+        inputs=None,
+        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
+    )
+    vote_button_tie.click(
+        fn=lambda: select_vote_improved('tie'),
+        inputs=None,
+        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
+    )
+    vote_button_neither.click(
+        fn=lambda: select_vote_improved('neither'),
+        inputs=None,
+        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
+    )
+    feedback_checkboxes.change(
+        fn=update_feedback,
+        inputs=[feedback_checkboxes],
+        outputs=[feedback_list]
+    )
+    submit_button.click(
+        fn=submit_vote_fixed,
+        inputs=[model_a_name, model_b_name, selected_winner, feedback_list, results_agg],
+        outputs=[
+            show_results_state, results_agg,
+            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
+            feedback_checkboxes,
+            feedback_section,
+            submit_button,
+            results_reveal_area,
+            random_question_btn,
+            results_table_display,
+            main_interface_area,
+            context_toggle_btn,
+            model_a_reveal,
+            model_b_reveal
+        ]
+    )
+    try_another_btn.click(
+        fn=load_new_question_improved,
+        inputs=[],
+        outputs=[
+            current_example, model_a_name, model_b_name, summary_a_text, summary_b_text,
+            selected_winner, feedback_list, show_results_state, results_agg, show_full_context,
+            query_display, context_description, context_display, context_toggle_btn,
+            summary_a_display, summary_b_display,
+            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
+            feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn,
+            main_interface_area
+        ]
     )
+if __name__ == "__main__":
+    demo.launch(debug=True)

static/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

static/styles.css ADDED Viewed

	@@ -0,0 +1,640 @@

+/* Base styles */
+body, .gradio-container {
+    background-color: #ffffff;
+    font-size: 15px;
+    overflow-x: hidden !important;
+}
+/* Main color variables for a simpler, more subdued theme */
+:root {
+    --primary: #FF7D1E;         /* Main orange accent - used sparingly */
+    --primary-light: #FFF8F2;   /* Very subtle orange tint */
+    --primary-selected: #FFE8D5; /* More visible but still subtle orange for selections */
+    --accent: #6B7280;          /* Neutral gray for most UI elements */
+    --text-dark: #333333;       /* Dark text */
+    --text-medium: #666666;     /* Medium text */
+    --border-light: #E6E6E6;    /* Light border */
+    --background-light: #F9F9F9; /* Light background */
+    --highlight: #FFFBEB;       /* Subtle highlight color */
+    --model-a-color: #92B4F4;   /* Model A color (blue) */
+    --model-b-color: #F8ADA7;   /* Model B color (red) */
+}
+/* Tab styling */
+.tabs {
+    margin-top: 0 !important;
+}
+/* Style for tab buttons */
+.tab-nav {
+    background-color: var(--background-light) !important;
+    padding: 5px 10px !important;
+    border-radius: 8px 8px 0 0 !important;
+    border-bottom: 1px solid var(--border-light) !important;
+}
+.tab-nav button {
+    font-size: 1.1em !important;
+    font-weight: 600 !important;
+    padding: 10px 25px !important;
+    margin: 0 5px !important;
+    border-radius: 6px 6px 0 0 !important;
+    border: none !important;
+    background-color: transparent !important;
+    color: var(--text-medium) !important;
+    transition: all 0.3s ease !important;
+}
+.tab-nav button.selected {
+    background-color: white !important;
+    color: var(--primary) !important;
+    border-bottom: 2px solid var(--primary) !important;
+}
+.tab-nav button:hover:not(.selected) {
+    background-color: rgba(255,255,255,0.5) !important;
+    color: var(--text-dark) !important;
+}
+/* Tab content area */
+.tabitem {
+    border: none !important;
+    padding: 20px 10px !important;
+}
+/* Style the row containing the Query title */
+#query-title-row {
+    margin: 0 !important;
+    padding: 0 10px !important;
+    display: flex !important;
+    align-items: center !important;
+    overflow: hidden !important;
+    height: 40px !important;
+}
+#query-title-row h3 {
+    margin: 0 !important;
+    padding: 0 !important;
+    font-size: 1.2em !important;
+    font-weight: 600 !important;
+    line-height: 1.2 !important;
+    flex-grow: 0 !important;
+    flex-shrink: 0 !important;
+    white-space: nowrap !important;
+    overflow: visible !important;
+    color: var(--text-dark) !important;
+}
+/* New query container layout with button next to box */
+#query-container {
+    display: flex !important;
+    align-items: stretch !important;
+    gap: 10px !important;
+    margin: 0 10px 8px 10px !important;
+    overflow: visible !important;
+}
+/* Style the query box - optimized for long queries */
+.query-box-row {
+    background-color: #F0F7FF !important; /* Light blue background */
+    padding: 12px 15px !important;
+    border-radius: 6px !important;
+    border: 1px solid #D1E3F8 !important; /* Light blue border */
+    margin: 0 !important;
+    align-items: flex-start !important;
+    flex: 1 1 50% !important;
+    max-width: 50% !important;
+    overflow: visible !important;
+    display: flex !important;
+    min-height: 50px !important;
+    height: auto !important;
+}
+/* Context description styling - simple version */
+.context-description {
+    background-color: transparent !important;
+    padding: 0 15px !important;
+    margin: 0 0 15px 0 !important;
+    font-style: normal !important;
+    color: var(--text-medium) !important; /* Lighter text color */
+    font-size: 1.05em !important; /* Slightly larger */
+}
+.context-topic {
+    display: inline-flex !important;
+    align-items: center !important;
+    background-color: transparent !important; /* No background */
+    padding: 0 !important;
+    border-radius: 0 !important;
+    box-shadow: none !important;
+}
+.topic-label {
+    font-weight: 600 !important;
+    color: var(--text-medium) !important; /* Lighter text color */
+    margin-right: 6px !important;
+}
+/* Style the Get Random Question button */
+.query-button {
+    padding: 0 20px !important;
+    border-radius: 6px !important;
+    font-weight: 500 !important;
+    flex: 0 0 auto !important;
+    display: flex !important;
+    align-items: center !important;
+    justify-content: center !important;
+    background-color: var(--background-light) !important;
+    color: var(--text-medium) !important;
+    border: 1px solid var(--border-light) !important;
+    font-size: 0.95em !important;
+    min-height: 50px !important;
+    white-space: nowrap !important;
+    transition: all 0.2s ease !important;
+    box-shadow: 0 1px 2px rgba(0,0,0,0.05) !important;
+}
+.query-button:hover {
+    background-color: var(--primary-light) !important;
+    color: var(--primary) !important;
+    border-color: var(--primary) !important;
+}
+/* Context header row with title and toggle button */
+#context-header-row {
+    display: flex !important;
+    justify-content: space-between !important;
+    align-items: center !important;
+    margin-bottom: 8px !important;
+    padding: 0 10px !important;
+}
+/* Context title styling */
+.context-title {
+    margin: 0 !important;
+    padding: 0 !important;
+    font-size: 1.2em !important;
+    font-weight: 600 !important;
+    color: var(--text-dark) !important;
+}
+/* Style for the context toggle button */
+.context-toggle-button {
+    background-color: var(--background-light) !important;
+    color: var(--text-medium) !important;
+    padding: 5px 10px !important;
+    border-radius: 4px !important;
+    border: 1px solid var(--border-light) !important;
+    font-size: 0.85em !important;
+    font-weight: 500 !important;
+    cursor: pointer !important;
+    transition: all 0.2s ease !important;
+    margin: 0 !important;
+    height: 30px !important;
+    line-height: 1 !important;
+    width: auto !important;
+    min-width: 0 !important;
+    max-width: 150px !important;
+}
+.context-toggle-button:hover {
+    background-color: var(--primary-light) !important;
+    color: var(--primary) !important;
+    border-color: var(--primary) !important;
+}
+/* Style the Markdown component displaying the query text */
+.query-text {
+    padding: 0 !important;
+    margin: 0 !important;
+    background-color: transparent !important;
+    border: none !important;
+    overflow: visible !important;
+    width: 100% !important;
+}
+/* Style the actual query text */
+.query-text p {
+    font-size: 1.2em !important;
+    font-weight: 600 !important;
+    color: #2E5AAC !important; /* Blue for query text */
+    line-height: 1.4 !important;
+    margin: 0 !important;
+    padding: 0 !important;
+    background-color: transparent !important;
+    border: none !important;
+    overflow-wrap: break-word !important;
+    word-wrap: break-word !important;
+    word-break: normal !important;
+    hyphens: auto !important;
+    white-space: normal !important;
+}
+/* Container for context items */
+.context-items-container {
+    border-radius: 6px;
+    overflow: hidden;
+}
+/* Style for individual context items */
+.context-item {
+    border: 1px solid var(--border-light);
+    background-color: var(--background-light);
+    padding: 12px;
+    border-radius: 6px;
+    margin-bottom: 8px;
+    font-size: 1em;
+    line-height: 1.5;
+    box-shadow: 0 1px 2px rgba(0,0,0,0.03);
+}
+/* Style for primary context items */
+.primary-context {
+    border-left: 3px solid #FFF0F0 !important; /* Light red border */
+}
+/* Style for chunk headers */
+.chunk-header {
+    font-weight: 600;
+    color: #2E5AAC;
+    margin-bottom: 8px;
+    padding-bottom: 5px;
+    border-bottom: 1px solid #D1E3F8;
+}
+/* Style for highlighted text within context items */
+.highlight {
+    background-color: #FFECB3 !important;
+    padding: 0.1em 0.3em !important;
+    border-radius: 3px !important;
+    font-weight: 600 !important;
+    color: #664500 !important;
+}
+/* Markdown table styling */
+.md-table {
+    width: 100% !important;
+    border-collapse: collapse !important;
+    margin: 10px 0 !important;
+    font-size: 0.95em !important;
+}
+.md-table th {
+    background-color: #F0F7FF !important;
+    color: #2E5AAC !important;
+    font-weight: 600 !important;
+    text-align: left !important;
+    padding: 10px !important;
+    border: 1px solid #D1E3F8 !important;
+}
+.md-table td {
+    padding: 8px 10px !important;
+    border: 1px solid #E6E6E6 !important;
+    vertical-align: top !important;
+}
+.md-table tr:nth-child(even) {
+    background-color: #F9F9F9 !important;
+}
+.md-table tr:hover {
+    background-color: #F0F7FF !important;
+}
+/* Style for the insufficient context alert */
+.insufficient-alert {
+    border: 2px solid #f78989;
+    background-color: #fff0f0;
+    color: #b92020;
+    padding: 12px;
+    border-radius: 6px;
+    margin-bottom: 12px;
+    font-size: 1em;
+}
+.insufficient-alert strong {
+    display: block;
+    margin-bottom: 5px;
+    font-size: 1.05em;
+}
+.insufficient-alert p {
+    margin: 0;
+    font-size: 1em;
+}
+/* Style for section headings */
+.section-heading {
+    color: var(--text-dark) !important;
+    margin: 5px 0 2px 0 !important;
+    padding: 0 !important;
+    font-weight: 600 !important;
+    font-size: 1.2em !important;
+}
+/* Style the group displaying model summaries */
+.summary-card {
+    border: 1px solid var(--border-light);
+    padding: 12px !important;
+    border-radius: 6px;
+    height: 100%;
+    box-shadow: 0 1px 3px rgba(0,0,0,0.03);
+    background-color: var(--background-light) !important;
+}
+/* Apply specific background colors to summary cards */
+.summary-card-a {
+    border-left: 3px solid #92B4F4 !important; /* Lighter blue accent */
+}
+.summary-card-b {
+    border-left: 3px solid #F8ADA7 !important; /* Light red accent */
+}
+/* Style the Textbox itself inside the summary card */
+.summary-card textarea {
+    font-size: 1em !important;
+    line-height: 1.4 !important;
+    background-color: rgba(255,255,255,0.7) !important;
+}
+/* Style the Textbox label */
+.summary-card .gr-input-label {
+    display: block !important;
+    padding: 0 0 5px 0 !important;
+    margin: 0 !important;
+    font-size: 1.05em !important;
+    font-weight: 600 !important;
+    color: var(--text-dark) !important;
+}
+/* Style the voting buttons */
+.vote-button {
+    flex-grow: 1;
+    margin: 0 5px !important;
+    font-size: 1.05em !important;
+    padding: 12px 15px !important;
+    border-radius: 6px !important;
+    transition: all 0.2s ease !important;
+    background-color: var(--background-light) !important;
+    border: 1px solid var(--border-light) !important;
+    min-height: 50px !important;
+    font-weight: 500 !important;
+    color: var(--text-dark) !important;
+    margin-bottom: 5px !important;
+}
+/* Hover effect for A/B/Tie buttons */
+.vote-button:hover:not(.vote-button-neither) {
+    background-color: var(--primary-light) !important;
+    border-color: var(--primary) !important;
+    color: var(--primary) !important;
+}
+/* Hover effect for Neither button */
+.vote-button-neither:hover {
+    background-color: #fff0f0 !important;
+    border-color: #f78989 !important;
+    color: #b92020 !important;
+}
+/* Style for selected buttons with persistent selection state */
+.vote-button.selected:not(.vote-button-neither) {
+    border-width: 2px !important;
+    border-style: solid !important;
+    border-color: #FF7D1E !important;
+    background-color: #FFF2E6 !important;
+    color: #FF7D1E !important;
+    font-weight: 600 !important;
+    box-shadow: 0 1px 3px rgba(0,0,0,0.1) !important;
+}
+/* Special neither button styling when selected */
+.vote-button-neither.selected {
+    border-width: 2px !important;
+    border-style: solid !important;
+    border-color: #f78989 !important;
+    background-color: #fff0f0 !important;
+    color: #b92020 !important;
+    font-weight: 600 !important;
+    box-shadow: 0 1px 3px rgba(0,0,0,0.1) !important;
+}
+/* Ensure selection state persists when hovered */
+.vote-button.selected:hover:not(.vote-button-neither) {
+    border-color: #FF7D1E !important;
+    background-color: #FFF2E6 !important;
+    color: #FF7D1E !important;
+}
+/* Ensure neither selection state persists when hovered */
+.vote-button-neither.selected:hover {
+    border-color: #f78989 !important;
+    background-color: #fff0f0 !important;
+    color: #b92020 !important;
+}
+/* Style the feedback section */
+.feedback-section {
+    padding: 3px 0 !important;
+    background-color: transparent !important;
+    margin-top: 3px !important;
+    margin-bottom: 3px !important;
+    font-size: 1em;
+    border: none !important;
+    box-shadow: none !important;
+}
+/* Improved feedback checkbox styling */
+.feedback-section .gr-check-radio {
+    font-size: 1.05em !important;
+}
+.feedback-section .gr-check-radio span {
+    font-size: 1.05em !important;
+    color: var(--text-dark) !important;
+}
+/* Checkbox larger size and color customization */
+.feedback-section input[type="checkbox"] {
+    width: 18px !important;
+    height: 18px !important;
+    margin-right: 6px !important;
+}
+/* Make the checkbox checked color stronger */
+.feedback-section input[type="checkbox"]:checked {
+    accent-color: #FF8C38 !important;
+    border-color: #FF8C38 !important;
+    background-color: #FF8C38 !important;
+}
+/* Style for model reveals */
+.model-reveal {
+    font-size: 1.3em !important;
+    padding: 8px 0 !important;
+    text-align: center !important;
+    margin-top: 5px !important;
+    font-weight: 600 !important;
+    border-radius: 6px !important;
+}
+/* Style for model A reveal */
+.model-a-reveal {
+    background-color: #F0F7FF !important;
+}
+/* Style for model B reveal */
+.model-b-reveal {
+    background-color: #FFF0F0 !important;
+}
+/* Style the control buttons area */
+.control-buttons button {
+    margin: 0 10px !important;
+    font-size: 1em !important;
+    border-radius: 6px !important;
+    padding: 8px 16px !important;
+    transition: all 0.2s ease !important;
+}
+/* Make headings slightly larger */
+h3 {
+    font-size: 1.2em !important;
+    font-weight: 600 !important;
+    margin: 5px 0 2px 0 !important;
+    padding: 0 !important;
+    color: var(--text-dark) !important;
+}
+/* Adjust main title size */
+h1 {
+    font-size: 1.6em !important;
+    color: var(--primary) !important;
+    margin: 10px 0 5px 0 !important;
+    padding: 0 !important;
+}
+/* Adjust main description size */
+#main-interface-area > p:first-of-type {
+    font-size: 1em !important;
+    margin: 0 0 8px 0 !important;
+    padding: 0 !important;
+    line-height: 1.4 !important;
+    color: var(--text-medium) !important;
+}
+/* Adjust CheckboxGroup label/choices size */
+.feedback-section .gr-input-label {
+    font-size: 1.1em !important;
+    font-weight: 600 !important;
+    margin-bottom: 0.6em !important;
+    color: var(--text-dark) !important;
+}
+/* Style DataFrame tables: font size, cell spacing and rounded corners */
+.gr-dataframe table {
+    font-size: 0.95em !important;
+    border-collapse: separate !important;
+    border-spacing: 0 !important;
+    border-radius: 6px !important;
+    overflow: hidden !important;
+}
+.gr-dataframe th, .gr-dataframe td {
+    padding: 8px 10px !important;
+    border: none !important;
+    border-bottom: 1px solid var(--border-light) !important;
+}
+.gr-dataframe th {
+    background-color: var(--background-light) !important;
+    color: var(--text-dark) !important;
+    font-weight: 600 !important;
+}
+/* Reduce space caused by Markdown wrappers */
+.gradio-container .prose {
+    line-height: 1.4 !important;
+    margin: 0 !important;
+    padding: 0 !important;
+}
+hr {
+    margin: 5px 0 !important;
+    border: none !important;
+    height: 1px !important;
+    background-color: var(--border-light) !important;
+}
+/* Fix for any scrollbar issues */
+.gradio-row {
+    overflow: visible !important;
+}
+/* Submit button styling */
+#submit-button {
+    background-color: var(--primary) !important;
+    color: white !important;
+    padding: 12px 30px !important;
+    border-radius: 6px !important;
+    font-weight: 600 !important;
+    font-size: 1.2em !important;
+    transition: all 0.2s ease !important;
+    box-shadow: 0 1px 2px rgba(0,0,0,0.08) !important;
+    border: none !important;
+    margin-top: 15px !important;
+}
+#submit-button:hover {
+    background-color: #E56E0F !important;
+    box-shadow: 0 2px 4px rgba(0,0,0,0.12) !important;
+}
+/* Try another button styling */
+#try-another-btn {
+    background-color: var(--primary) !important;
+    color: white !important;
+    padding: 10px 25px !important;
+    border-radius: 6px !important;
+    font-weight: 600 !important;
+    transition: all 0.2s ease !important;
+    box-shadow: 0 1px 2px rgba(0,0,0,0.08) !important;
+    border: none !important;
+}
+#try-another-btn:hover {
+    background-color: #E56E0F !important;
+    box-shadow: 0 2px 4px rgba(0,0,0,0.12) !important;
+}
+/* Reduce vertical spacing */
+.gradio-column > *, .gradio-row > * {
+    margin-top: 0 !important;
+    margin-bottom: 0 !important;
+    padding-top: 0 !important;
+    padding-bottom: 0 !important;
+}
+.gradio-markdown {
+    margin-top: 0 !important;
+    margin-bottom: 0 !important;
+    padding-top: 0 !important;
+    padding-bottom: 0 !important;
+}
+/* Reduce container padding */
+.gradio-container {
+    padding: 0 !important;
+}
+/* Custom compact spacing for specific sections */
+#main-interface-area > div {
+    margin-bottom: 4px !important;
+}
+/* Media query for responsive behavior on smaller screens */
+@media screen and (max-width: 768px) {
+    #query-container {
+        flex-direction: column !important;
+    }
+    .query-box-row {
+        flex: 1 1 100% !important;
+        max-width: 100% !important;
+        margin-bottom: 10px !important;
+    }
+    .query-button {
+        width: 100% !important;
+    }
+}

utils/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Makes utils a proper Python package
2	+ # This file can be empty

utils/arena_df.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

utils/context_processor.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import re
+import json
def debug_text(text, label="Text"):
    """
    Print a short diagnostic report about a piece of text.

    Parameters:
    - text: The string being inspected
    - label: Name shown in the report header
    """
    print(f"\n--- DEBUG {label} ---")
    # Size and a short preview of the text
    char_count = len(text)
    print(f"Length: {char_count}")
    preview = text[:100]
    print(f"First 100 chars: {preview}")
    # Report which of the two highlight marker spellings is present
    has_standard_marker = '[[highlight_start]]' in text
    print(f"Contains highlight_start: {has_standard_marker}")
    has_alternative_marker = '[[start_highlight]]' in text
    print(f"Contains start_highlight: {has_alternative_marker}")
    print("-------------------------\n")
def clean_json_text(text):
    """
    Undo JSON-style quote escaping left over in text that came from JSON.
    This handles the case of text like: "the sky isn\\'t falling"

    The previous implementation round-tripped the text through
    json.dumps()/json.loads(), which always yields the input unchanged, so
    no escape was ever removed. Only escaped quotes are unescaped here;
    other backslash sequences are left alone because the text may already be
    decoded and a backslash may be genuine content.

    Parameters:
    - text: The text to clean; non-string values are returned unchanged

    Returns:
    - The text with backslash-escaped single and double quotes unescaped
    """
    # Nothing to do for non-strings or text without any backslash
    if not isinstance(text, str) or '\\' not in text:
        return text
    return text.replace("\\'", "'").replace('\\"', '"')
def process_highlights(text):
    """
    Process highlight markers in text to create HTML highlighted text.
    Handles both the standard format ([[highlight_start]]...[[highlight_end]])
    and the alternative format ([[start_highlight]]...[[end_highlight]]).
    The text is passed through clean_json_text() first.

    Parameters:
    - text: The text to process; None or an empty string yields ""

    Returns:
    - The text with each marked region wrapped in <span class="highlight">
    """
    # Missing content is rendered as nothing instead of raising inside re.sub
    if text is None or text == "":
        return ""
    # Clean JSON escaping
    text = clean_json_text(text)
    replacement = r'<span class="highlight">\1</span>'
    marker_patterns = (
        r'\[\[highlight_start\]\](.*?)\[\[highlight_end\]\]',
        r'\[\[start_highlight\]\](.*?)\[\[end_highlight\]\]',
    )
    highlighted_text = text
    for pattern in marker_patterns:
        # DOTALL lets a highlight span line breaks; without it a multi-line
        # highlight is left with its raw markers in the output
        highlighted_text = re.sub(pattern, replacement, highlighted_text, flags=re.DOTALL)
    return highlighted_text
def process_table_with_highlights(markdown_table):
    """
    Render a markdown table that may contain highlight markers as HTML.
    Highlights are applied one line at a time so the row structure of the
    table is kept, then the result is converted to an HTML table.
    """
    table_rows = markdown_table.strip().split('\n')
    # Apply highlight processing to every row independently
    highlighted_rows = [process_highlights(row) for row in table_rows]
    rejoined_table = '\n'.join(highlighted_rows)
    return convert_markdown_table_to_html(rejoined_table)
# A delimiter row contains only dashes (with optional alignment colons) in
# every cell, e.g. "|---|:-:|" or "| -- | --- |".
_TABLE_SEPARATOR_RE = re.compile(r'^\s*\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|?\s*$')


def _split_table_row(line):
    """Return the raw cell strings of one pipe-delimited table row."""
    pieces = line.split('|')
    # Drop the empty piece before the leading pipe and, when the row is
    # closed by a trailing pipe, the piece after it
    return pieces[1:-1] if line.strip().endswith('|') else pieces[1:]


def _render_table_row(line, cell_tag):
    """Render one table row as <tr>, wrapping each cell in cell_tag."""
    cells = ''.join(
        f'<{cell_tag}>{process_highlights(cell.strip())}</{cell_tag}>'
        for cell in _split_table_row(line)
    )
    return f'<tr>{cells}</tr>'


def convert_markdown_table_to_html(markdown_text):
    """
    Converts a markdown table to an HTML table.

    Only lines starting with '|' are treated as table rows; any other line
    in the input is not included in the generated table. When the second
    table line is a delimiter row, the first line becomes the header;
    otherwise every row is rendered as data.

    Parameters:
    - markdown_text: Text containing a pipe-delimited markdown table

    Returns:
    - An HTML <table class="md-table"> string, or the (cleaned) input text
      when fewer than two table lines are found
    """
    # Clean JSON escaping
    markdown_text = clean_json_text(markdown_text)
    table_lines = [
        line for line in markdown_text.strip().split('\n')
        if line.strip().startswith('|')
    ]
    if len(table_lines) < 2:  # Need at least header and separator
        return markdown_text  # Return original if not a proper table
    parts = ['<table class="md-table">']
    # Detect the delimiter row structurally: a substring test for '---'
    # misses short delimiters such as "|--|" and mistakes a data row whose
    # cell merely contains "---" for a delimiter
    if _TABLE_SEPARATOR_RE.match(table_lines[1]):
        parts.append('<thead>' + _render_table_row(table_lines[0], 'th') + '</thead>')
        # Skip the header and the delimiter row
        body_lines = table_lines[2:]
    else:
        # No header row, treat all rows as data
        body_lines = table_lines
    parts.append('<tbody>')
    parts.extend(_render_table_row(line, 'td') for line in body_lines)
    parts.append('</tbody>')
    parts.append('</table>')
    return ''.join(parts)
def _is_nan(value):
    """Return True for a float NaN (NaN is the only value unequal to itself)."""
    return isinstance(value, float) and value != value


def _render_context_item(context_item, extra_class=""):
    """Render one context chunk (a dict with 'content', or a plain string) as a context-item div."""
    if isinstance(context_item, dict):
        context_text = context_item.get('content', '')
    else:
        # Tolerate plain-string chunks instead of failing on .get()
        context_text = context_item
    if context_text is None or _is_nan(context_text):
        context_text = ''
    elif not isinstance(context_text, str):
        context_text = str(context_text)
    # Check for markdown table format (both standard and newline format)
    if '|' in context_text and ('\n|' in context_text or '\n-' in context_text):
        body = process_table_with_highlights(context_text)
    else:
        # Regular text with potential highlights
        body = process_highlights(context_text)
    return f'<div class="context-item{extra_class}">{body}</div>'


def get_context_html(example, show_full=False):
    """
    Formats the context chunks into an HTML string for display using specific CSS classes.
    Includes an alert for insufficient context and applies highlighting.

    Parameters:
    - example: The example data containing contexts
    - show_full: Boolean indicating whether to show full context

    Returns:
    - The HTML string: an optional insufficient-context alert followed by a
      container holding one div per context chunk
    """
    html = ""
    # Add insufficient context warning if needed. A NaN flag (empty cell in
    # the source data) is truthy, so it is explicitly treated as "not set".
    insufficient = example.get("insufficient", False)
    if insufficient and not _is_nan(insufficient):
        insufficient_reason = example.get("insufficient_reason", "")
        # Only a non-blank string is a usable reason; a NaN would otherwise
        # be rendered literally as "nan"
        if isinstance(insufficient_reason, str) and insufficient_reason.strip():
            reason_html = f"<p>{insufficient_reason}</p>"
        else:
            reason_html = "<p>The context may not contain enough information to fully answer the question, or the question might be ambiguous. Models should ideally indicate this limitation or refuse to answer.</p>"
        html += f"""
        <div class="insufficient-alert">
            <strong>
                <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align: middle; margin-right: 5px;">
                    <path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"></path>
                    <line x1="12" y1="9" x2="12" y2="13"></line>
                    <line x1="12" y1="17" x2="12.01" y2="17"></line>
                </svg>
                Insufficient Context
            </strong>
            {reason_html}
        </div>
        """
    # Create container div for all context items
    html += '<div class="context-items-container">'
    full_contexts = example.get("full_contexts")
    highlighted_contexts = example.get("contexts")
    if show_full and full_contexts:
        # Full view: one item per chunk, without the primary marker
        for context_item in full_contexts:
            html += _render_context_item(context_item)
    elif highlighted_contexts:
        # Highlight view: primary chunks get an extra CSS class
        for context_item in highlighted_contexts:
            is_primary = isinstance(context_item, dict) and context_item.get('is_primary', False)
            html += _render_context_item(context_item, " primary-context" if is_primary else "")
    else:
        # If no contexts available, show a message
        html += '<div class="context-item">No context available. Try toggling to full context view.</div>'
    # Close the container div
    html += '</div>'
    return html

utils/data_loader.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import os
+import json
+import pandas as pd
+import random
+import re
+from .context_processor import process_highlights
def load_arena_data():
    """
    Loads the arena data from the arena_df.csv file in the utils directory.

    The file is looked up next to this module first, so loading does not
    depend on the process's current working directory; the original
    working-directory-relative path 'utils/arena_df.csv' is kept as a
    fallback.

    Returns:
    - A DataFrame with the arena examples, or an empty DataFrame when the
      file cannot be read
    """
    try:
        # Prefer the CSV that sits beside this module
        module_dir = os.path.dirname(os.path.abspath(__file__))
        csv_path = os.path.join(module_dir, 'arena_df.csv')
        if not os.path.exists(csv_path):
            # Fall back to the path relative to the working directory
            csv_path = os.path.join('utils', 'arena_df.csv')
        # Read the CSV file
        df = pd.read_csv(csv_path)
        print(f"Loaded arena data with {len(df)} examples")
        return df
    except (OSError, ValueError) as e:
        # OSError covers a missing/unreadable file; pandas parser and
        # empty-data errors derive from ValueError
        print(f"Error loading arena data: {e}")
        # Return an empty DataFrame if file can't be loaded
        return pd.DataFrame()
def create_dummy_example():
    """
    Creates a dummy example if no data is loaded.

    The context entries use the same dict shape that get_random_example()
    produces ('chunk_num', 'content', 'is_primary'); a list of bare strings
    would break consumers that call .get() on each entry. The legacy
    'full_context' string is kept alongside the 'full_contexts' list.

    Returns:
    - A placeholder example dict flagged as unanswerable / insufficient
    """
    placeholder_chunk = "No context available"
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "contexts": [
            {"chunk_num": 1, "content": placeholder_chunk, "is_primary": False}
        ],
        "full_contexts": [
            {"chunk_num": 1, "content": placeholder_chunk}
        ],
        "full_context": "Error loading context data.",
        "Answerable": False,
        "insufficient": True,
        "insufficient_reason": ""
    }
def _missing_to_default(value, default):
    """Return default when value is None or a float NaN (what an empty CSV cell is read as)."""
    if value is None or (isinstance(value, float) and value != value):
        return default
    return value


def _parse_full_contexts(contexts_raw):
    """Decode the JSON 'contexts' cell into a list of {'chunk_num', 'content'} dicts."""
    if not isinstance(contexts_raw, str):
        return []
    chunks = json.loads(contexts_raw)
    if not isinstance(chunks, list):
        return []
    # chunk_num reflects the position in the original list, even when some
    # entries are skipped for lacking a 'content' key
    return [
        {'chunk_num': position, 'content': chunk.get('content', '')}
        for position, chunk in enumerate(chunks, start=1)
        if isinstance(chunk, dict) and 'content' in chunk
    ]


def _parse_highlighted_contexts(raw):
    """Decode the 'contexts_highlighted' cell into a list of {'type', 'abbreviatedContent'} items."""
    if not isinstance(raw, str):
        # Already an object, not a string
        return list(raw) if isinstance(raw, (list, tuple)) else []
    # Try direct parsing first: a real JSON parse keeps content that
    # contains escaped quotes intact and decodes escape sequences
    try:
        decoded = json.loads(raw)
    except ValueError:
        decoded = None
    if isinstance(decoded, list):
        return decoded
    # Fall back to regex extraction for strings that are not valid JSON
    type_pattern = r'"type":\s*"(primary|secondary)"'
    content_pattern = r'"abbreviatedContent":\s*"([^"]*)"|"abbreviatedContent":\s*"([^"]*)'
    types = re.findall(type_pattern, raw)
    # Each match is a tuple of capture groups; keep the non-empty one
    contents = [
        next((part for part in groups if part), "")
        for groups in re.findall(content_pattern, raw)
    ]
    return [
        {'type': ctx_type, 'abbreviatedContent': content}
        for ctx_type, content in zip(types, contents)
    ]


def get_random_example():
    """
    Selects a random example from the loaded arena data.

    Returns:
    - A dict with 'question', 'processed_context_desc', 'Answerable',
      'insufficient', 'insufficient_reason', 'full_contexts' (all chunks)
      and 'contexts' (highlighted chunks, falling back to the full chunks);
      a dummy example when no data could be loaded
    """
    # Load the arena data
    df = load_arena_data()
    if df.empty:
        # If no data is loaded, return a dummy example
        return create_dummy_example()
    # Select a random row
    example = df.sample(1).iloc[0]
    # Empty CSV cells come back as NaN, which is truthy and prints as
    # "nan"; replace them with the intended defaults
    processed_example = {
        "question": example['question'],
        "processed_context_desc": _missing_to_default(example.get('processed_context_desc', ''), ''),
        "Answerable": _missing_to_default(example.get('Answerable', True), True),
        "insufficient": _missing_to_default(example.get('insufficient', False), False),
        "insufficient_reason": _missing_to_default(example.get('insufficient_reason', ''), '')
    }
    # Process contexts - for full context
    try:
        processed_example["full_contexts"] = _parse_full_contexts(example['contexts'])
    except (KeyError, TypeError, ValueError) as e:
        print(f"Error processing contexts: {e}")
        processed_example["full_contexts"] = []
    # Process highlighted contexts for display (best effort: any failure
    # is reported and the full contexts are used instead)
    contexts_highlighted = []
    try:
        raw_highlighted = _missing_to_default(example.get('contexts_highlighted'), None)
        if raw_highlighted:
            parsed_items = _parse_highlighted_contexts(raw_highlighted)
            for position, item in enumerate(parsed_items, start=1):
                if not isinstance(item, dict):
                    continue
                contexts_highlighted.append({
                    'chunk_num': position,
                    # Process highlights using the standard format
                    'content': process_highlights(item.get('abbreviatedContent', '')),
                    'is_primary': item.get('type', 'secondary') == 'primary'
                })
    except Exception as e:
        print(f"Error processing highlighted contexts: {e}")
    # If we couldn't process the highlighted contexts, fall back to the full contexts
    if not contexts_highlighted and processed_example["full_contexts"]:
        for position, ctx in enumerate(processed_example["full_contexts"], start=1):
            contexts_highlighted.append({
                'chunk_num': position,
                'content': ctx.get('content', ''),
                'is_primary': False
            })
    processed_example["contexts"] = contexts_highlighted
    return processed_example
def get_random_example_and_models(model_names):
    """
    Draw a random arena example together with two distinct model names.

    Parameters:
    - model_names: The names to draw the two competing models from

    Returns:
    - The example, the model assigned to position A and the model assigned
      to position B
    """
    picked_example = get_random_example()
    # random.sample never repeats an element, so the two models differ
    contenders = random.sample(model_names, 2)
    first_model, second_model = contenders
    return picked_example, first_model, second_model

utils/leaderboard.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import os
+import pandas as pd
+import random
+from .models import model_names
def load_leaderboard_data():
    """
    Loads the leaderboard data from the leaderboard CSV file.

    When the file does not exist, the tallies are pre-populated with random
    values for every known model.

    Returns:
    - A dict with per-model 'wins', 'losses' and 'ties' mappings plus the
      total number of 'votes'; the empty structure if loading fails
    """
    # Initialize the results structure
    results = {"wins": {}, "losses": {}, "ties": {}, "votes": 0}
    outcomes = ("wins", "losses", "ties")
    try:
        # Define the path to the CSV file for leaderboard
        csv_path = os.path.join('utils', 'arena_df_leaderboard.csv')
        # Check if the file exists and load it
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            # Process the data into our structure
            for _, row in df.iterrows():
                model = row['model']
                for outcome in outcomes:
                    # Store plain ints rather than numpy integers
                    results[outcome][model] = int(row[outcome])
        else:
            # If file doesn't exist, pre-populate with some data
            for model in model_names:
                results["wins"][model] = random.randint(0, 10)
                results["losses"][model] = random.randint(0, 10)
                results["ties"][model] = random.randint(0, 5)
        # Calculate total votes. Every pairwise vote is recorded on two
        # models, so the grand total of all tallies is halved. The previous
        # "wins + losses + ties // 2" only halved the ties because //
        # binds tighter than +, double-counting decisive votes.
        # NOTE(review): assumes a tie is recorded on both models -- confirm
        # against the vote handler.
        tally_total = sum(
            count
            for outcome in outcomes
            for count in results[outcome].values()
        )
        results["votes"] = tally_total // 2
        return results
    except Exception as e:
        print(f"Error loading leaderboard data: {e}")
        # Return the initialized structure if file can't be loaded
        return results
def save_leaderboard_data(results):
    """
    Saves the current leaderboard results back to the CSV file.

    Parameters:
    - results: The results dictionary containing wins, losses, ties, and votes

    Every model that appears in any of the wins/losses/ties mappings is
    written, with 0 for a missing tally. Errors are reported on stdout and
    not raised.
    """
    try:
        # Define the path to the CSV file
        csv_path = os.path.join('utils', 'arena_df_leaderboard.csv')
        # Collect models from all three tallies (first-seen order) so a
        # model without recorded wins is not dropped from the file
        models = list(dict.fromkeys(
            [*results["wins"], *results["losses"], *results["ties"]]
        ))
        data = [
            {
                'model': model,
                'wins': results["wins"].get(model, 0),
                'losses': results["losses"].get(model, 0),
                'ties': results["ties"].get(model, 0)
            }
            for model in models
        ]
        # Explicit columns keep the header row even when there are no models
        df = pd.DataFrame(data, columns=['model', 'wins', 'losses', 'ties'])
        # Save to CSV
        df.to_csv(csv_path, index=False)
        print(f"Leaderboard data saved successfully to {csv_path}")
    except Exception as e:
        print(f"Error saving leaderboard data: {e}")

utils/models.py ADDED Viewed

	@@ -0,0 +1,41 @@

# --- Dummy Model Summaries ---
# Stand-in "models": each takes (context, question, answerable) and returns a
# canned summary built from the first 20 characters of the question.
def _alpha_summary(context, question, answerable):
    verdict = 'answerable' if answerable else 'unanswerable'
    return f"Alpha Summary: Based on the context for '{question[:20]}...', it appears the question is {verdict}."


def _beta_summary(context, question, answerable):
    verdict = 'allow' if answerable else 'do not allow'
    return f"Beta Summary: Regarding '{question[:20]}...', the provided documents {verdict} for a conclusive answer based on the text."


def _gamma_summary(context, question, answerable):
    verdict = 'can' if answerable else 'cannot'
    return f"Gamma Summary: For the question '{question[:20]}...', I {verdict} provide a specific answer from the given text snippets."


def _delta_summary(context, question, answerable):
    verdict = 'sufficient' if answerable else 'insufficient'
    return f"Delta Summary: The context for '{question[:20]}...' is {verdict} to formulate a direct response. Therefore, I must refuse."


# Maps each display name to the callable that produces its summary
dummy_models = {
    "Model Alpha": _alpha_summary,
    "Model Beta": _beta_summary,
    "Model Gamma": _gamma_summary,
    "Model Delta (Refusal Specialist)": _delta_summary,
}
# Model names in definition order, for easy access
model_names = list(dummy_models)
def _join_context_contents(context_items):
    """Join the string 'content' of each dict chunk with the '---' separator."""
    parts = [
        ctx["content"]
        for ctx in (context_items or [])
        if isinstance(ctx, dict) and isinstance(ctx.get("content"), str)
    ]
    return "\n---\n".join(parts)


def _strip_highlight_markup(text):
    """Remove highlight spans and raw highlight markers, keeping the highlighted words."""
    import re  # local import: this module has no top-level imports

    text = re.sub(r'<span class="highlight">(.*?)</span>', r'\1', text, flags=re.DOTALL)
    for marker in ('[[highlight_start]]', '[[highlight_end]]',
                   '[[start_highlight]]', '[[end_highlight]]'):
        text = text.replace(marker, '')
    return text


def generate_summaries(example, model_a_name, model_b_name):
    """
    Generates summaries for the given example using the assigned models.

    Parameters:
    - example: The example data ('contexts', 'full_contexts', 'question', 'Answerable')
    - model_a_name: Key in dummy_models of the model shown as A
    - model_b_name: Key in dummy_models of the model shown as B

    Returns:
    - The summary of model A and the summary of model B
    """
    # Create a plain text version of the contexts for the models. The
    # highlighted contexts may carry highlight markup meant for display,
    # which is removed so the models receive plain text.
    context_text = _strip_highlight_markup(_join_context_contents(example.get("contexts")))
    if not context_text:
        # Fallback to full contexts if highlighted contexts are not
        # available or contain no usable text
        context_text = _strip_highlight_markup(_join_context_contents(example.get("full_contexts")))
    # Pass 'Answerable' status to models (they might use it)
    answerable = example.get("Answerable", True)
    question = example.get("question", "")
    # Call the dummy model functions
    summary_a = dummy_models[model_a_name](context_text, question, answerable)
    summary_b = dummy_models[model_b_name](context_text, question, answerable)
    return summary_a, summary_b

utils/ui_helpers.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import gradio as gr
+from .context_processor import get_context_html
def toggle_context_display(example, current_state):
    """
    Switch the context panel between the highlights view and the full view.

    Parameters:
    - example: The current example data
    - current_state: Boolean indicating if full context is already shown

    Returns:
    - The new state, an update carrying the re-rendered context HTML, and an
      update carrying the toggle button's new text and CSS classes
    """
    showing_full = not current_state
    # The button always offers the view that is NOT currently displayed and
    # carries the showing-full class only while the full context is visible
    if showing_full:
        toggle_label = "Show Highlights"
        button_classes = ["context-toggle-button", "showing-full"]
    else:
        toggle_label = "Show Full Context"
        button_classes = ["context-toggle-button"]
    rendered_context = get_context_html(example, show_full=showing_full)
    context_update = gr.update(value=rendered_context)
    button_update = gr.update(value=toggle_label, elem_classes=button_classes)
    # Returned positionally, in the order the outputs are expected
    return showing_full, context_update, button_update
def update_feedback(choice):
    """
    Pass the current checkbox selection through as the new feedback state.

    Parameters:
    - choice: The selection reported by the checkbox component

    Returns:
    - The same selection object, unchanged (not wrapped in a dictionary)
    """
    selected = choice
    return selected