SLM-RAG-Arena

Running on Zero

App Files Files Community

SLM-RAG-Arena / app.py

oliver-aizip

refresh and persistent logging

15dd199 6 months ago

raw

history blame

21.5 kB

	import gradio as gr
	import random
	import pandas as pd
	import os
	import threading
	import time
	from utils.data_loader import get_random_example
	from utils.models import generate_summaries, model_names
	from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
	from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
	from utils.vote_logger import save_vote_details
	from utils.shared import generation_interrupt # Import from shared module

	# Feedback choices shown after a vote, keyed by the vote outcome.
	# "left"/"right" offer praise for the winning model, "tie" offers praise for
	# both models, and "neither" offers criticism for both.
	_POSITIVE_REASONS = (
	    "More complete",
	    "More accurate",
	    "More relevant",
	    "Better written",
	    "Better refusal (if applicable)",
	)
	_NEGATIVE_REASONS = (
	    "Incomplete",
	    "Hallucinate",
	    "Irrelevant",
	    "Incorrect refusal (if applicable)",
	)


	def _reasons_for(model_label, reasons):
	    """Prefix every reason with the model it refers to, e.g. 'Model A: ...'."""
	    return [f"{model_label}: {reason}" for reason in reasons]


	feedback_options = {
	    "left": _reasons_for("Model A", _POSITIVE_REASONS),
	    "right": _reasons_for("Model B", _POSITIVE_REASONS),
	    "tie": (
	        _reasons_for("Model A", _POSITIVE_REASONS)
	        + _reasons_for("Model B", _POSITIVE_REASONS)
	    ),
	    "neither": (
	        _reasons_for("Model A", _NEGATIVE_REASONS)
	        + _reasons_for("Model B", _NEGATIVE_REASONS)
	    ),
	}

	def load_context(set_interrupt=False):
	    """
	    Pick a new random example and build the UI updates that display it.

	    Parameters:
	    - set_interrupt: If True, raise the generation interrupt flag and wait
	      briefly before loading, so running inference can notice it

	    Returns a six-item list: the example itself, updates for the question,
	    the context description, the context HTML and the context toggle button,
	    and the reset "show full context" flag (always False).
	    """
	    if set_interrupt:
	        # Ask any in-flight inference to stop, then give it a moment to react.
	        generation_interrupt.set()
	        time.sleep(0.2)

	    # The flag must be down again before new work starts.
	    generation_interrupt.clear()
	    sample = get_random_example()

	    # Wrap the topic description in its styled container when there is one;
	    # an empty description is passed through unchanged and hidden.
	    topic = sample.get('processed_context_desc', '')
	    topic_html = (
	        f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {topic}</div>"
	        if topic
	        else topic
	    )

	    expanded = False
	    chunks_html = get_context_html(sample, show_full=expanded)

	    question_update = gr.update(value=sample['question'])
	    topic_update = gr.update(value=topic_html, visible=bool(topic_html))
	    chunks_update = gr.update(value=chunks_html)
	    toggle_update = gr.update(value="Show Full Context", elem_classes=["context-toggle-button"])
	    return [sample, question_update, topic_update, chunks_update, toggle_update, expanded]

	def load_leaderboard():
	    """Load the leaderboard data and return it rendered as HTML."""
	    return generate_leaderboard_html(load_leaderboard_data())

	def generate_model_summaries(example):
	    """
	    Run inference for two randomly chosen models on one example.

	    Returns a dict with keys "model_a", "model_b", "summary_a", "summary_b"
	    and "completed". "completed" is True only when generation was not
	    interrupted and both summaries are non-empty. Errors are printed and
	    leave whatever was filled in so far.
	    """
	    outcome = dict(model_a="", model_b="", summary_a="", summary_b="", completed=False)

	    # Nothing to do if an interrupt was requested before we even started.
	    if generation_interrupt.is_set():
	        return outcome

	    try:
	        first_model, second_model = random.sample(model_names, 2)

	        # Record the pairing up front so a later failure still reports it.
	        outcome.update(model_a=first_model, model_b=second_model)

	        text_a, text_b = generate_summaries(example, first_model, second_model)

	        # Discard the texts if an interrupt arrived while generating.
	        if not generation_interrupt.is_set():
	            outcome.update(
	                summary_a=text_a,
	                summary_b=text_b,
	                completed=bool(text_a and text_b),
	            )
	    except Exception as e:
	        print(f"Error in generation: {e}")

	    return outcome

	def process_generation_result(result):
	    """
	    Convert a generation result into the values that refresh the arena UI.

	    Parameters:
	    - result: dict shaped like the one built by generate_model_summaries,
	      with keys "model_a", "model_b", "summary_a", "summary_b", "completed"

	    Returns a 20-item list, in the order of the `outputs` wired to
	    process_example: eight state values (model names, summary texts, cleared
	    winner, cleared feedback list, results flag False, freshly loaded
	    leaderboard data) followed by twelve component updates.

	    Voting is enabled only when the run completed and both summaries are
	    non-empty. Otherwise every vote button is disabled and each missing
	    summary is replaced on screen by a failure notice.
	    """
	    failure_notice = "Generation was interrupted or failed."
	    summary_a = result.get("summary_a", "")
	    summary_b = result.get("summary_b", "")
	    can_vote = bool(result.get("completed") and summary_a and summary_b)

	    # The summary keys are always present (initialised to ""), so a
	    # dict.get default would never be used; fall back with `or` so an empty
	    # summary really shows the notice. On success both texts are non-empty
	    # and are displayed unchanged.
	    shown_a = summary_a or failure_notice
	    shown_b = summary_b or failure_notice

	    return [
	        # --- state values ---
	        result.get("model_a", ""),
	        result.get("model_b", ""),
	        summary_a,
	        summary_b,
	        None,   # selected winner cleared
	        [],     # feedback list cleared
	        False,  # results not shown yet
	        load_leaderboard_data(),
	        # --- summary text boxes ---
	        gr.update(value=shown_a),
	        gr.update(value=shown_b),
	        # --- vote buttons: A, B, tie, neither (also drops any "selected" class) ---
	        gr.update(interactive=can_vote, elem_classes=["vote-button"]),
	        gr.update(interactive=can_vote, elem_classes=["vote-button"]),
	        gr.update(interactive=can_vote, elem_classes=["vote-button"]),
	        gr.update(interactive=can_vote, elem_classes=["vote-button", "vote-button-neither"]),
	        # --- feedback checkboxes and section hidden until a vote is picked ---
	        gr.update(choices=[], value=[], interactive=False, visible=False),
	        gr.update(visible=False),
	        # --- submit button visible but disabled ---
	        gr.update(interactive=False, visible=True),
	        # --- results area hidden ---
	        gr.update(visible=False),
	        # --- "new question" button usable again ---
	        gr.update(interactive=True),
	        # --- main interface area: extra CSS classes removed ---
	        gr.update(elem_classes=[]),
	    ]

	def process_example(example):
	    """Generate both model summaries for *example* and return the UI updates for them."""
	    return process_generation_result(generate_model_summaries(example))

	def select_vote_improved(winner_choice):
	    """
	    Build the UI updates that follow a click on one of the vote buttons.

	    Returns an eight-item list: the chosen winner, the feedback checkbox
	    update (choices for that outcome, nothing ticked), the feedback section
	    and submit button updates, then the CSS class updates for the A, B, tie
	    and neither buttons, with "selected" added to the one that was chosen.
	    """
	    # Base CSS classes per button, keyed by the vote outcome it represents.
	    classes_by_choice = {
	        'left': ["vote-button"],
	        'right': ["vote-button"],
	        'tie': ["vote-button"],
	        'neither': ["vote-button", "vote-button-neither"],
	    }
	    if winner_choice in classes_by_choice:
	        classes_by_choice[winner_choice].append("selected")

	    checkbox_update = gr.update(
	        choices=feedback_options.get(winner_choice, []),
	        value=[],
	        interactive=True,
	        visible=True,
	    )
	    button_updates = [
	        gr.update(elem_classes=classes_by_choice[key])
	        for key in ('left', 'right', 'tie', 'neither')
	    ]
	    return [
	        winner_choice,
	        checkbox_update,
	        gr.update(visible=True),
	        gr.update(interactive=True),
	        *button_updates,
	    ]

	def handle_vote_submission(example, m_a, m_b, winner, feedback, summary_a, summary_b, current_results):
	    """
	    Record a submitted vote and return the resulting UI updates.

	    With a winner selected, the full vote details are saved first and the
	    return value is whatever submit_vote_with_elo produces. Without a winner
	    a warning is printed and an empty dict is returned.
	    """
	    if winner is not None:
	        # Persist the detailed record before touching the ratings.
	        save_vote_details(example, m_a, m_b, winner, feedback, summary_a, summary_b)
	        return submit_vote_with_elo(m_a, m_b, winner, feedback, current_results)

	    print("Warning: Submit called without a winner selected.")
	    return {}

	def show_loading_state():
	    """
	    Return the updates shown while a new question is being fetched.

	    Six items: both summary boxes get a loading message, followed by the
	    four vote buttons (A, B, tie, neither), all made non-interactive.
	    """
	    placeholder = "Loading new question and summaries..."
	    summary_updates = [gr.update(value=placeholder, interactive=False) for _ in range(2)]
	    vote_button_updates = [gr.update(interactive=False) for _ in range(4)]
	    return summary_updates + vote_button_updates

	def handle_new_example_click():
	    """Interrupt any running generation and return a freshly loaded example."""
	    # load_context does the interruption itself when set_interrupt=True;
	    # only the example (its first return item) is needed here.
	    new_example, *_ = load_context(set_interrupt=True)
	    return new_example

	def update_ui_for_new_context(example):
	    """
	    Build the updates that display a newly loaded example.

	    Returns a five-item list: updates for the question, the context
	    description, the (collapsed) context HTML and the toggle button, plus
	    the reset "show full context" flag (False).
	    """
	    # Wrap the topic description in its styled container when there is one;
	    # an empty description is passed through unchanged and hidden.
	    topic = example.get('processed_context_desc', '')
	    topic_html = (
	        f"<div class='context-topic'><span class='topic-label'>The question and context are about:</span> {topic}</div>"
	        if topic
	        else topic
	    )

	    question_update = gr.update(value=example['question'])
	    topic_update = gr.update(value=topic_html, visible=bool(topic_html))
	    chunks_update = gr.update(value=get_context_html(example, False))
	    toggle_update = gr.update(value="Show Full Context", elem_classes=["context-toggle-button"])
	    return [question_update, topic_update, chunks_update, toggle_update, False]

	# Handler for the unload event
	def cleanup_on_disconnect():
	    """Log the disconnect and raise the generation interrupt flag."""
	    print("Browser disconnected. Cleaning up resources...")
	    # Setting the flag is enough: no sleep here, generation code is expected
	    # to notice it the next time it checks.
	    generation_interrupt.set()

	# Create Gradio interface
	# Builds the whole Gradio UI (an "Arena" tab and a "Leaderboard" tab) and
	# wires every event handler defined above to its components.
	# NOTE(review): in this copy of the file all nesting indentation is missing,
	# so which container each component belongs to cannot be read off these
	# lines. Restore the indentation from the original file before running.
	with gr.Blocks(theme=gr.themes.Default(
	primary_hue=gr.themes.colors.orange,
	secondary_hue=gr.themes.colors.slate
	)) as demo:
	# Load CSS
	# The stylesheet path is resolved against the current working directory.
	css_path = os.path.join(os.getcwd(), 'static', 'styles.css')

	# Load the CSS file
	# No encoding is passed to open(), so the platform default is used.
	with open(css_path, 'r') as f:
	css_content = f.read()

	# Create HTML components with CSS
	# The stylesheet is injected inline through a <style> tag.
	gr.HTML(f"<style>{css_content}</style>")

	# Add JavaScript to handle browser unload events
	# NOTE(review): the snippet posts to '/cleanup', but no such route is
	# defined in the visible part of this file - confirm it exists elsewhere.
	# It also reads window.gradioClientState.session_hash; verify the Gradio
	# frontend in use exposes that object. navigator.sendBeacon queues an
	# asynchronous request, despite the "synchronous" wording in the JS comment.
	unload_js = """
	<script>
	// This runs when the page is about to be closed or refreshed
	window.addEventListener('beforeunload', function(e) {
	// Send a synchronous request to the server
	navigator.sendBeacon('/cleanup?session_id=' + window.gradioClientState.session_hash);
	});
	</script>
	"""
	gr.HTML(unload_js)

	# State Variables
	# current_example: the example dict being shown
	# model_*_name / summary_*_text: the model pairing and its outputs
	# selected_winner / feedback_list: the vote in progress
	# results_agg: leaderboard data, loaded once here while the UI is built
	# show_full_context: whether the context is currently expanded
	current_example = gr.State({})
	model_a_name = gr.State("")
	model_b_name = gr.State("")
	summary_a_text = gr.State("")
	summary_b_text = gr.State("")
	selected_winner = gr.State(None)
	feedback_list = gr.State([])
	show_results_state = gr.State(False)
	results_agg = gr.State(load_leaderboard_data())
	show_full_context = gr.State(False)

	# Create Tabs
	with gr.Tabs() as tabs:
	# Main Arena Tab
	with gr.TabItem("Arena", id="arena-tab"):
	gr.Markdown("# RAG SLM Summarizer/Generator Arena")
	gr.Markdown("""
	1️⃣ Review the query and examine the highlighted context (✨ highlights contain key information! )\n
	2️⃣ Compare answers generated by two different models side-by-side\n
	3️⃣ Vote for the better response or select 'Tie/Neither' if appropriate""")

	gr.HTML("<hr>")

	# Main container
	with gr.Column(elem_id="main-interface-area") as main_interface_area:
	# Query section
	with gr.Row(elem_id="query-title-row"):
	gr.Markdown("### 💬 Query (What Users Want to Ask About the Doc)", elem_classes="section-heading")

	with gr.Row(elem_id="query-container"):
	with gr.Row(elem_classes="query-box-row"):
	query_display = gr.Markdown(value="Loading question...", elem_classes="query-text", elem_id="query-section")
	random_question_btn = gr.Button("🔄 Try a New Question", elem_classes="query-button")

	# Context description and display
	context_description = gr.Markdown("", elem_classes="context-description")

	gr.HTML("<hr>")

	with gr.Row(elem_id="context-header-row"):
	gr.Markdown("### 📋 Context (Relevant Information We Got from the Database)", elem_classes="context-title")
	context_toggle_btn = gr.Button("Show Full Context", elem_classes=["context-toggle-button"])

	context_display = gr.HTML(value="Loading context...", label="Context Chunks")

	gr.Markdown("---")
	gr.Markdown("### 🔍 Compare Answers from Models", elem_classes="section-heading")

	# Model summaries - Add ID for JavaScript to target and disable autoscroll
	with gr.Row(elem_id="summary-containers"):
	with gr.Column(scale=1):
	with gr.Group(elem_classes=["summary-card", "summary-card-a"]):
	summary_a_display = gr.Textbox(
	label="Model A",
	lines=10,
	interactive=False,
	show_copy_button=True,
	autoscroll=False, # Disable auto-scrolling
	elem_id="summary-a-display"
	)
	with gr.Column(scale=1):
	with gr.Group(elem_classes=["summary-card", "summary-card-b"]):
	summary_b_display = gr.Textbox(
	label="Model B",
	lines=10,
	interactive=False,
	show_copy_button=True,
	autoscroll=False, # Disable auto-scrolling
	elem_id="summary-b-display"
	)

	gr.HTML("<hr>")

	# Voting section
	# All four buttons start disabled; process_generation_result enables them
	# once both summaries are available.
	gr.Markdown("### 🏅 Cast Your Vote", elem_classes="section-heading")
	with gr.Row():
	vote_button_a = gr.Button("⬅️ Summary A is Better", elem_classes=["vote-button"], interactive=False)
	vote_button_tie = gr.Button("🤝 Tie / Equally Good", elem_classes=["vote-button"], interactive=False)
	vote_button_b = gr.Button("➡️ Summary B is Better", elem_classes=["vote-button"], interactive=False)
	vote_button_neither = gr.Button("❌ Neither is Good", elem_classes=["vote-button", "vote-button-neither"], interactive=False)

	# Feedback and Submit sections
	# Hidden/disabled until select_vote_improved reveals them after a vote click.
	with gr.Group(elem_classes=["feedback-section"], visible=False) as feedback_section:
	feedback_checkboxes = gr.CheckboxGroup(label="Feedback (optional)", choices=[], interactive=False)
	submit_button = gr.Button("Submit Your Vote", variant="primary", interactive=False, elem_id="submit-button")

	# Results area
	with gr.Column(visible=False) as results_reveal_area:
	gr.Markdown("---")
	gr.Markdown("### ✅ Vote Submitted!", elem_classes="section-heading")

	# Model reveal section
	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown("### Model A was:", elem_classes="section-heading")
	model_a_reveal = gr.Markdown("", elem_classes="model-reveal model-a-reveal")
	with gr.Column(scale=1):
	gr.Markdown("### Model B was:", elem_classes="section-heading")
	model_b_reveal = gr.Markdown("", elem_classes="model-reveal model-b-reveal")

	gr.HTML("<hr>")

	# Try another button
	with gr.Row(elem_classes=["control-buttons"]):
	try_another_btn = gr.Button("🔄 Try Another Question", elem_id="try-another-btn")

	# Leaderboard Tab
	with gr.TabItem("Leaderboard", id="leaderboard-tab"):
	gr.Markdown("# RAG SLM Summarizer/Generator Leaderboard", elem_classes="orange-title")
	gr.Markdown("View performance statistics for all models ranked by Elo rating.")

	with gr.Group(elem_id="leaderboard-info"):
	gr.Markdown("""### About Elo Ratings

	The Elo rating system provides a more accurate ranking than simple win rates:
	- All models start at 1500 points
	- Points are exchanged after each comparison based on the expected outcome
	- Beating a stronger model earns more points than beating a weaker one
	- The ± value shows the statistical confidence interval (95%)
	""")

	# Filled by load_leaderboard (on page load and on tab selection) and by
	# the vote submission handler.
	results_table_display = gr.HTML(label="Model Performance")

	# Event handling
	# Toggle context display
	context_toggle_btn.click(
	fn=toggle_context_display,
	inputs=[current_example, show_full_context],
	outputs=[show_full_context, context_display, context_toggle_btn]
	)

	# Initial loading - context first, then summaries
	# Uses load_context without interruption since it's the first load
	# The six outputs below match load_context's six-item return list; the
	# twenty outputs of the .then() step match process_generation_result's
	# return list item for item, so the two must be kept in sync.
	demo.load(
	fn=load_context, # Default is set_interrupt=False
	inputs=[],
	outputs=[current_example, query_display, context_description, context_display,
	context_toggle_btn, show_full_context]
	).then(
	fn=process_example,
	inputs=[current_example],
	outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
	selected_winner, feedback_list, show_results_state, results_agg,
	summary_a_display, summary_b_display, vote_button_a, vote_button_b,
	vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
	submit_button, results_reveal_area, random_question_btn, main_interface_area]
	)

	# Load leaderboard content on app start
	demo.load(
	fn=load_leaderboard,
	inputs=[],
	outputs=[results_table_display]
	)

	# Use a single event chain for each button, structured to update UI first, then run inference
	# Both "new question" buttons get the same four-step chain.
	for btn in [random_question_btn, try_another_btn]:
	btn.click(
	# Step 1: Show loading state immediately
	fn=show_loading_state,
	inputs=[],
	outputs=[summary_a_display, summary_b_display, vote_button_a,
	vote_button_b, vote_button_tie, vote_button_neither]
	).then(
	# Step 2: Get new example
	fn=handle_new_example_click,
	inputs=[],
	outputs=[current_example]
	).then(
	# Step 3: Update context UI immediately
	fn=update_ui_for_new_context,
	inputs=[current_example],
	outputs=[query_display, context_description, context_display,
	context_toggle_btn, show_full_context]
	).then(
	# Step 4: Then process example for model outputs
	fn=process_example,
	inputs=[current_example],
	outputs=[model_a_name, model_b_name, summary_a_text, summary_b_text,
	selected_winner, feedback_list, show_results_state, results_agg,
	summary_a_display, summary_b_display, vote_button_a, vote_button_b,
	vote_button_tie, vote_button_neither, feedback_checkboxes, feedback_section,
	submit_button, results_reveal_area, random_question_btn, main_interface_area]
	)

	# Vote button handlers
	# `choice=choice` binds the current loop value as a default argument, so
	# each button keeps its own choice instead of the last one in the loop.
	# The eight outputs match select_vote_improved's return list.
	for btn, choice in zip(
	[vote_button_a, vote_button_b, vote_button_tie, vote_button_neither],
	['left', 'right', 'tie', 'neither']
	):
	btn.click(
	fn=lambda choice=choice: select_vote_improved(choice),
	inputs=None,
	outputs=[selected_winner, feedback_checkboxes, feedback_section, submit_button,
	vote_button_a, vote_button_b, vote_button_tie, vote_button_neither]
	)

	# Update feedback when checkboxes change
	feedback_checkboxes.change(
	fn=update_feedback,
	inputs=[feedback_checkboxes],
	outputs=[feedback_list]
	)

	# Process vote submission and reveal results
	# NOTE(review): the sixteen outputs are filled by whatever
	# submit_vote_with_elo (utils.leaderboard) returns - its return shape is not
	# visible here, verify it matches this order. When no winner is selected
	# handle_vote_submission returns an empty dict instead.
	submit_button.click(
	fn=handle_vote_submission,
	inputs=[current_example, model_a_name, model_b_name, selected_winner, feedback_list, summary_a_text, summary_b_text, results_agg],
	outputs=[show_results_state, results_agg, vote_button_a, vote_button_b,
	vote_button_tie, vote_button_neither, feedback_checkboxes,
	feedback_section, submit_button, results_reveal_area,
	random_question_btn, results_table_display, main_interface_area,
	context_toggle_btn, model_a_reveal, model_b_reveal]
	)

	# Refresh leaderboard when switching to the leaderboard tab
	# NOTE(review): the listener is attached to the Tabs container, so it
	# presumably fires on every tab selection, not only Leaderboard - confirm.
	tabs.select(
	fn=load_leaderboard,
	inputs=[],
	outputs=[results_table_display],
	api_name="refresh_leaderboard"
	)

	# Register unload event for browser disconnections
	# NOTE(review): cleanup_on_disconnect sets generation_interrupt, which is
	# imported once at module level from utils.shared and so looks process-wide.
	# Confirm one user's disconnect cannot interrupt another user's generation.
	demo.unload(cleanup_on_disconnect)

	# Start the Gradio app (debug mode on) only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch(debug=True)