# Hugging Face Spaces page-header artifact (not code): "Spaces: Running on Zero".
# This app is hosted as a Space running on ZeroGPU hardware.
import os
import random

import gradio as gr
import pandas as pd

from utils.data_loader import get_random_example
from utils.leaderboard import load_leaderboard_data, save_leaderboard_data
from utils.models import generate_summaries, model_names
from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html

# Read CSS from file once at startup; it is embedded in the page later via a
# <style> tag.  The explicit encoding avoids platform-dependent decoding of
# any non-ASCII characters in the stylesheet.
css_path = os.path.join(os.getcwd(), 'static', 'styles.css')
with open(css_path, 'r', encoding='utf-8') as f:
    css_content = f.read()
# Feedback options
# Checkbox choices offered after a vote.  Keys match the value passed to
# select_vote_improved: 'left' = Model A won, 'right' = Model B won,
# 'tie', or 'neither'.
feedback_options = {
    "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"],
    "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
    "tie": ["Both complete", "Both accurate", "Both well written", "Both handle refusal well (if applicable)"],
    "neither": ["Both incomplete", "Both hallucinate", "Both irrelevant", "Both incorrectly refuse (if applicable)", "A is bad", "B is bad"]
}
def load_new_question_improved(agg_results=None, show_full=False):
    """Loads a new random question, contexts, and model summaries.

    Returns the 26 values expected by the load/click listeners: the new
    session state followed by gr.update objects that reset every widget
    for a fresh comparison round.
    """
    # Fall back to the persisted leaderboard when no aggregate state is given.
    agg_results = load_leaderboard_data() if agg_results is None else agg_results

    sample = get_random_example()
    # Draw two distinct anonymised contestants for this round.
    name_a, name_b = random.sample(model_names, 2)
    text_a, text_b = generate_summaries(sample, name_a, name_b)

    topic_html = sample.get('processed_context_desc', '')
    if topic_html:
        topic_html = f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {topic_html}</div>"

    # A fresh question always starts with the collapsed context view.
    show_full = False
    context_html = get_context_html(sample, show_full=show_full)

    return [
        sample,                                   # current_example
        name_a,                                   # model_a_name
        name_b,                                   # model_b_name
        text_a,                                   # summary_a_text
        text_b,                                   # summary_b_text
        None,                                     # selected_winner
        [],                                       # feedback_list
        False,                                    # show_results_state
        agg_results,                              # results_agg
        show_full,                                # show_full_context
        gr.update(value=sample['question']),      # query_display
        gr.update(value=topic_html, visible=bool(topic_html)),  # context_description
        gr.update(value=context_html),            # context_display
        gr.update(value="Show Full Context", elem_classes=["context-toggle-button"]),  # context_toggle_btn
        gr.update(value=text_a),                  # summary_a_display
        gr.update(value=text_b),                  # summary_b_display
        gr.update(interactive=True, elem_classes=["vote-button"]),  # vote_button_a
        gr.update(interactive=True, elem_classes=["vote-button"]),  # vote_button_b
        gr.update(interactive=True, elem_classes=["vote-button"]),  # vote_button_tie
        gr.update(interactive=True, elem_classes=["vote-button", "vote-button-neither"]),  # vote_button_neither
        gr.update(choices=[], value=[], interactive=False, visible=False),  # feedback_checkboxes
        gr.update(visible=False),                 # feedback_section
        gr.update(interactive=False, visible=True),  # submit_button
        gr.update(visible=False),                 # results_reveal_area
        gr.update(interactive=True),              # random_question_btn
        gr.update(elem_classes=[])                # main_interface_area
    ]
def select_vote_improved(winner_choice):
    """Handles vote button selections.

    Highlights the chosen vote button, reveals the matching optional
    feedback checkboxes, and enables the submit button.
    """
    # Base CSS classes for each vote button; the chosen one gets "selected"
    # appended so the stylesheet can highlight it.
    button_classes = {
        'left': ["vote-button"],
        'right': ["vote-button"],
        'tie': ["vote-button"],
        'neither': ["vote-button", "vote-button-neither"],
    }
    if winner_choice in button_classes:
        button_classes[winner_choice] = button_classes[winner_choice] + ["selected"]

    return [
        winner_choice,  # selected_winner
        gr.update(choices=feedback_options.get(winner_choice, []), value=[], interactive=True, visible=True),  # feedback_checkboxes
        gr.update(visible=True),       # feedback_section
        gr.update(interactive=True),   # submit_button
        gr.update(elem_classes=button_classes['left']),     # vote_button_a
        gr.update(elem_classes=button_classes['right']),    # vote_button_b
        gr.update(elem_classes=button_classes['tie']),      # vote_button_tie
        gr.update(elem_classes=button_classes['neither'])   # vote_button_neither
    ]
def submit_vote_fixed(m_a, m_b, winner, feedback, current_results):
    """Processes vote submission and updates results.

    Args:
        m_a / m_b: real model names behind "Model A" / "Model B".
        winner: 'left', 'right', 'tie' or 'neither'; None aborts the submit.
        feedback: list of selected feedback strings (passed through to state;
            not tallied here).
        current_results: aggregate dict {"wins": {}, "losses": {}, "ties": {},
            "votes": int} held in gr.State.

    Returns:
        The 16 values wired to submit_button.click, or {} (a no-op update)
        when no winner has been selected.
    """
    if winner is None:
        print("Warning: Submit called without a winner selected.")
        return {}

    # Copy the nested tally dicts as well: dict.copy() alone is shallow, so
    # incrementing updated_results["wins"][...] would mutate the shared
    # gr.State object in place before it is explicitly returned.
    updated_results = {
        **current_results,
        "wins": dict(current_results.get("wins", {})),
        "losses": dict(current_results.get("losses", {})),
        "ties": dict(current_results.get("ties", {})),
    }
    # Ensure both contestants have entries in every tally.
    for model in (m_a, m_b):
        for bucket in ("wins", "losses", "ties"):
            updated_results[bucket].setdefault(model, 0)

    if winner == 'left':
        updated_results["wins"][m_a] += 1
        updated_results["losses"][m_b] += 1
    elif winner == 'right':
        updated_results["wins"][m_b] += 1
        updated_results["losses"][m_a] += 1
    elif winner == 'tie':
        updated_results["ties"][m_a] += 1
        updated_results["ties"][m_b] += 1
    # 'neither' deliberately records no per-model win/loss/tie.

    updated_results["votes"] = updated_results.get("votes", 0) + 1
    save_leaderboard_data(updated_results)

    # Prepare Results Table
    results_df = _build_leaderboard_df(updated_results)

    return [
        True,                                   # show_results_state
        updated_results,                        # results_agg
        gr.update(interactive=False),           # vote_button_a
        gr.update(interactive=False),           # vote_button_b
        gr.update(interactive=False),           # vote_button_tie
        gr.update(interactive=False),           # vote_button_neither
        gr.update(interactive=False),           # feedback_checkboxes
        gr.update(visible=True),                # feedback_section
        gr.update(visible=False),               # submit_button
        gr.update(visible=True),                # results_reveal_area
        gr.update(interactive=False),           # random_question_btn
        gr.update(value=results_df, visible=True),     # results_table_display
        gr.update(elem_classes=["results-revealed"]),  # main_interface_area
        gr.update(interactive=True),            # context_toggle_btn
        gr.update(value=m_a),                   # model_a_reveal
        gr.update(value=m_b)                    # model_b_reveal
    ]


def _build_leaderboard_df(results):
    """Builds the leaderboard DataFrame sorted by descending win rate.

    A tie counts as half a win: win_rate = (wins + 0.5*ties) / comparisons.
    """
    rows = []
    all_models = sorted(set(results["wins"]) | set(results["losses"]) | set(results["ties"]))
    for model in all_models:
        wins = results["wins"].get(model, 0)
        losses = results["losses"].get(model, 0)
        ties = results["ties"].get(model, 0)
        total_comparisons = wins + losses + ties
        win_rate = (wins + 0.5 * ties) / total_comparisons if total_comparisons > 0 else 0.0
        rows.append({
            "Model": model,
            "Win Rate (%)": f"{win_rate:.1%}",
            "Wins": wins,
            "Losses": losses,
            "Ties": ties,
            "Comparisons": total_comparisons
        })
    results_df = pd.DataFrame(rows)
    if not results_df.empty:
        # Sort on the numeric rate, not the formatted percentage string.
        results_df['Win Rate Value'] = results_df['Win Rate (%)'].str.rstrip('%').astype('float') / 100.0
        results_df = results_df.sort_values(by='Win Rate Value', ascending=False).drop(columns=['Win Rate Value'])
    return results_df
# Build the <style> tag that is injected into the page markup below.
css_html = "<style>" + css_content + "</style>"
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Default(
    primary_hue=gr.themes.colors.orange,
    secondary_hue=gr.themes.colors.slate
)) as demo:
    # Embed CSS directly in HTML
    gr.HTML(css_html)

    # State Variables (per-session; order mirrors the return list of
    # load_new_question_improved)
    current_example = gr.State({})      # active example dict from get_random_example
    model_a_name = gr.State("")         # hidden identity of "Model A"
    model_b_name = gr.State("")         # hidden identity of "Model B"
    summary_a_text = gr.State("")
    summary_b_text = gr.State("")
    selected_winner = gr.State(None)    # 'left' | 'right' | 'tie' | 'neither'
    feedback_list = gr.State([])        # checkbox feedback for the current vote
    show_results_state = gr.State(False)
    results_agg = gr.State({"wins": {}, "losses": {}, "ties": {}, "votes": 0})
    show_full_context = gr.State(False)  # toggled by context_toggle_btn

    # Create Tabs
    with gr.Tabs() as tabs:  # `tabs` handle unused but kept for clarity/extension
        # Main Arena Tab
        with gr.TabItem("Arena", id="arena-tab"):
            # Main title and description
            gr.Markdown("# RAG Summarizer Arena")
            gr.Markdown("Compare summaries generated by different models based on the provided context and query. Select the better summary, or choose 'Tie' or 'Neither'. Your feedback helps evaluate model performance.")

            # Main container (gets "results-revealed" class after voting)
            with gr.Column(elem_id="main-interface-area") as main_interface_area:
                # Query section
                with gr.Row(elem_id="query-title-row"):
                    gr.Markdown("### Query", elem_classes="section-heading")
                with gr.Row(elem_id="query-container"):
                    with gr.Row(elem_classes="query-box-row"):
                        query_display = gr.Markdown(value="Loading question...", elem_classes="query-text")
                        random_question_btn = gr.Button("🔄 Get Random Question", elem_classes="query-button")

                # Context description (topic summary; hidden when empty)
                context_description = gr.Markdown("", elem_classes="context-description")

                # Context section
                with gr.Row(elem_id="context-header-row"):
                    gr.Markdown("### Context Provided", elem_classes="context-title")
                    context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])
                context_display = gr.HTML(value="Loading context...", label="Context Chunks")

                gr.Markdown("---")
                gr.Markdown("### Compare Summaries", elem_classes="section-heading")

                # Model summaries, side by side and anonymised as A / B
                with gr.Row():
                    with gr.Column(scale=1):
                        with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
                            summary_a_display = gr.Textbox(label="Model A", lines=10, interactive=False, show_copy_button=True)
                    with gr.Column(scale=1):
                        with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
                            summary_b_display = gr.Textbox(label="Model B", lines=10, interactive=False, show_copy_button=True)

                # Voting section
                gr.Markdown("### Cast Your Vote", elem_classes="section-heading")
                with gr.Row():
                    vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"])
                    vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"])
                    vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"])
                    vote_button_neither = gr.Button("❌ Neither is Adequate", elem_classes=["vote-button", "vote-button-neither"])

                # Feedback section (revealed once a vote button is selected)
                with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
                    feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)

                # Submit button (enabled by select_vote_improved)
                submit_button = gr.Button("Submit Vote", variant="primary", interactive=False, elem_id="submit-button")

            # Results area (shown after submit; reveals the model identities)
            with gr.Column(visible=False) as results_reveal_area:
                gr.Markdown("---")
                gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")

                # Model reveal section
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Model A was actually:", elem_classes="section-heading")
                        model_a_reveal = gr.Markdown("", elem_classes="model-reveal model-a-reveal")
                    with gr.Column(scale=1):
                        gr.Markdown("### Model B was actually:", elem_classes="section-heading")
                        model_b_reveal = gr.Markdown("", elem_classes="model-reveal model-b-reveal")

                gr.HTML("<div style='height: 10px;'></div>")  # small vertical spacer

                # Try another button
                with gr.Row(elem_classes=["control-buttons"]):
                    try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")

        # Leaderboard Tab
        with gr.TabItem("Leaderboard", id="leaderboard-tab"):
            gr.Markdown("# Model Performance Leaderboard")
            gr.Markdown("View aggregate performance statistics for all models. The table below shows win rates, wins, losses, and ties for each model based on all evaluations.")
            results_table_display = gr.DataFrame(label="Model Performance", interactive=False, wrap=True)

    # Event Listeners

    # Expand/collapse the context chunks.
    context_toggle_btn.click(
        fn=toggle_context_display,
        inputs=[current_example, show_full_context],
        outputs=[show_full_context, context_display, context_toggle_btn]
    )

    # Load the first question when the page opens.
    demo.load(
        fn=load_new_question_improved,
        inputs=[],
        outputs=[
            current_example, model_a_name, model_b_name, summary_a_text, summary_b_text,
            selected_winner, feedback_list, show_results_state, results_agg, show_full_context,
            query_display, context_description, context_display, context_toggle_btn,
            summary_a_display, summary_b_display,
            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
            feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn,
            main_interface_area
        ]
    )

    # "Get Random Question" resets the whole round.
    random_question_btn.click(
        fn=load_new_question_improved,
        inputs=[],
        outputs=[
            current_example, model_a_name, model_b_name, summary_a_text, summary_b_text,
            selected_winner, feedback_list, show_results_state, results_agg, show_full_context,
            query_display, context_description, context_display, context_toggle_btn,
            summary_a_display, summary_b_display,
            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
            feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn,
            main_interface_area
        ]
    )

    # The four vote buttons share one handler, parameterised by choice.
    vote_button_a.click(
        fn=lambda: select_vote_improved('left'),
        inputs=None,
        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
    )
    vote_button_b.click(
        fn=lambda: select_vote_improved('right'),
        inputs=None,
        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
    )
    vote_button_tie.click(
        fn=lambda: select_vote_improved('tie'),
        inputs=None,
        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
    )
    vote_button_neither.click(
        fn=lambda: select_vote_improved('neither'),
        inputs=None,
        outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button, vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
    )

    # Mirror checkbox selections into session state.
    feedback_checkboxes.change(
        fn=update_feedback,
        inputs=[feedback_checkboxes],
        outputs=[feedback_list]
    )

    # Record the vote, persist the leaderboard, and reveal the results.
    submit_button.click(
        fn=submit_vote_fixed,
        inputs=[model_a_name, model_b_name, selected_winner, feedback_list, results_agg],
        outputs=[
            show_results_state, results_agg,
            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
            feedback_checkboxes,
            feedback_section,
            submit_button,
            results_reveal_area,
            random_question_btn,
            results_table_display,
            main_interface_area,
            context_toggle_btn,
            model_a_reveal,
            model_b_reveal
        ]
    )

    # "Try Another Question" behaves exactly like the random-question button.
    try_another_btn.click(
        fn=load_new_question_improved,
        inputs=[],
        outputs=[
            current_example, model_a_name, model_b_name, summary_a_text, summary_b_text,
            selected_winner, feedback_list, show_results_state, results_agg, show_full_context,
            query_display, context_description, context_display, context_toggle_btn,
            summary_a_display, summary_b_display,
            vote_button_a, vote_button_b, vote_button_tie, vote_button_neither,
            feedback_checkboxes, feedback_section, submit_button, results_reveal_area, random_question_btn,
            main_interface_area
        ]
    )
# Script entry point: start the Gradio server (debug=True surfaces errors
# in the browser and enables verbose logging).
if __name__ == "__main__":
    demo.launch(debug=True)