SLM-RAG-Arena

Running on Zero

SLM-RAG-Arena / utils / context_processor.py

Haoguang Cai

add UI and data processing

8a142a6 6 months ago

8.55 kB

	import re
	import json

	def debug_text(text, label="Text"):
	    """
	    Print a short diagnostic summary of ``text`` to stdout.

	    The summary shows the length, the first 100 characters, and whether
	    either highlight start marker spelling occurs in the text.

	    Parameters:
	    - text: The string to inspect
	    - label: Name shown in the banner line of the summary
	    """
	    print(f"\n--- DEBUG {label} ---")

	    size = len(text)
	    print(f"Length: {size}")

	    preview = text[:100]
	    print(f"First 100 chars: {preview}")

	    # Both marker spellings are reported, in the same order every time.
	    for marker in ("highlight_start", "start_highlight"):
	        print(f"Contains {marker}: {f'[[{marker}]]' in text}")

	    print("-------------------------\n")

	def clean_json_text(text):
	    """
	    Remove leftover JSON-style quote escaping from text.

	    Turns backslash-escaped quotes back into plain quotes, e.g.
	    "the sky isn\\'t falling" becomes "the sky isn't falling".

	    The previous implementation round-tripped the text through
	    json.dumps()/json.loads(), which always reproduces the input unchanged,
	    so no escape was ever removed.

	    Parameters:
	    - text: The text to clean

	    Returns the cleaned string. Anything that is not a string (for example
	    None) is returned unchanged, as is text without any backslash.
	    """
	    if not isinstance(text, str) or '\\' not in text:
	        return text

	    # Only quote escapes are undone; other backslashes (paths, LaTeX, ...)
	    # are left alone because they may be genuine content.
	    return text.replace("\\'", "'").replace('\\"', '"')

	def process_highlights(text):
	    """
	    Convert highlight markers in text into HTML highlight spans.

	    Two marker spellings are recognised:
	    - [[highlight_start]] ... [[highlight_end]]
	    - [[start_highlight]] ... [[end_highlight]]

	    Each marked region becomes <span class="highlight">...</span>. The text
	    is first passed through clean_json_text().

	    Parameters:
	    - text: The text that may contain highlight markers

	    Returns the text with every complete marker pair replaced.
	    """
	    highlighted_text = clean_json_text(text)

	    replacement = r'<span class="highlight">\1</span>'
	    marker_patterns = (
	        r'\[\[highlight_start\]\](.*?)\[\[highlight_end\]\]',
	        r'\[\[start_highlight\]\](.*?)\[\[end_highlight\]\]',
	    )

	    for pattern in marker_patterns:
	        # DOTALL lets "." match newlines, so a highlight that spans several
	        # lines is wrapped instead of leaving its raw markers in the output.
	        highlighted_text = re.sub(
	            pattern, replacement, highlighted_text, flags=re.DOTALL
	        )

	    return highlighted_text

	def process_table_with_highlights(markdown_table):
	    """
	    Render a markdown table that may contain highlight markers as HTML.

	    Highlights are applied one line at a time, and the re-joined text is
	    then handed to convert_markdown_table_to_html().

	    Parameters:
	    - markdown_table: The markdown table text

	    Returns whatever convert_markdown_table_to_html() produces for the
	    highlighted text.
	    """
	    rows = markdown_table.strip().split('\n')
	    highlighted_table = '\n'.join(map(process_highlights, rows))
	    return convert_markdown_table_to_html(highlighted_table)

	def _split_table_row(line):
	    """Return the raw (unstripped) cell strings of one markdown table row."""
	    parts = line.split('|')
	    # Whatever precedes the leading pipe is never a cell; whatever follows
	    # a trailing pipe is not a cell either.
	    return parts[1:-1] if line.strip().endswith('|') else parts[1:]


	def _render_table_row(line, cell_tag):
	    """Render one markdown table row as an HTML <tr> of ``cell_tag`` cells."""
	    cells = ''.join(
	        f'<{cell_tag}>{process_highlights(cell.strip())}</{cell_tag}>'
	        for cell in _split_table_row(line)
	    )
	    return f'<tr>{cells}</tr>'


	def convert_markdown_table_to_html(markdown_text):
	    """
	    Converts a markdown table to an HTML table.

	    Only lines that start with a pipe are treated as table rows; every
	    other line is ignored. If the second table line contains '---' it is
	    taken as the header separator: the first line becomes the <thead> and
	    the lines after the separator become the <tbody>. Otherwise every table
	    line is rendered as a body row. Highlight markers inside cells are
	    converted by process_highlights().

	    Rows were previously matched against the two-character string
	    backslash + pipe, which never occurs in ordinary markdown, so no table
	    was ever built.

	    Parameters:
	    - markdown_text: The markdown text containing the table

	    Returns the HTML table string, or the JSON-cleaned input text when it
	    contains fewer than two table lines.
	    """
	    # Clean JSON escaping
	    markdown_text = clean_json_text(markdown_text)

	    table_lines = [
	        line for line in markdown_text.strip().split('\n')
	        if line.strip().startswith('|')
	    ]

	    if len(table_lines) < 2:  # Need at least header and separator
	        return markdown_text  # Return original if not a proper table

	    html_parts = ['<table class="md-table">']

	    if '---' in table_lines[1]:
	        # First line is the header; index 1 is the separator and is skipped.
	        html_parts.append(f'<thead>{_render_table_row(table_lines[0], "th")}</thead>')
	        body_lines = table_lines[2:]
	    else:
	        # No header row, treat all rows as data
	        body_lines = table_lines

	    html_parts.append('<tbody>')
	    html_parts.extend(_render_table_row(line, 'td') for line in body_lines)
	    html_parts.append('</tbody>')
	    html_parts.append('</table>')

	    return ''.join(html_parts)

	def _looks_like_markdown_table(text):
	    """Heuristic: text has a pipe and a later line starting with a pipe or dash."""
	    return '|' in text and ('\n|' in text or '\n-' in text)


	def _render_context_item(context_text, extra_class=""):
	    """Render one context chunk as a context-item div, as a table when it looks like one."""
	    # A missing or None content value is rendered as an empty item.
	    context_text = context_text or ''
	    if _looks_like_markdown_table(context_text):
	        body = process_table_with_highlights(context_text)
	    else:
	        body = process_highlights(context_text)
	    return f'<div class="context-item{extra_class}">{body}</div>'


	def get_context_html(example, show_full=False):
	    """
	    Formats the context chunks into an HTML string for display using specific CSS classes.
	    Includes an alert for insufficient context and applies highlighting.

	    Chunks that look like markdown tables are rendered as HTML tables; all
	    other chunks are rendered as highlighted text. Table detection
	    previously searched for backslash + pipe, so table chunks were never
	    routed to the table renderer.

	    Parameters:
	    - example: The example data containing contexts. Keys read here are
	      "insufficient", "insufficient_reason", "full_contexts" and "contexts";
	      each context item is read via its "content" and "is_primary" keys.
	    - show_full: Boolean indicating whether to show full context

	    Returns the HTML string: an optional insufficient-context alert followed
	    by a container div holding one div per context item, or a placeholder
	    message when there is nothing to show.
	    """
	    html = ""

	    # Add insufficient context warning if needed
	    if example.get("insufficient", False):
	        insufficient_reason = example.get("insufficient_reason", "")
	        reason_html = f"<p>{insufficient_reason}</p>" if insufficient_reason else "<p>The context may not contain enough information to fully answer the question, or the question might be ambiguous. Models should ideally indicate this limitation or refuse to answer.</p>"

	        html += f"""
	<div class="insufficient-alert">
	<strong>
	<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align: middle; margin-right: 5px;">
	<path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"></path>
	<line x1="12" y1="9" x2="12" y2="13"></line>
	<line x1="12" y1="17" x2="12.01" y2="17"></line>
	</svg>
	Insufficient Context
	</strong>
	{reason_html}
	</div>
	"""

	    # Create container div for all context items
	    html += '<div class="context-items-container">'

	    if show_full and example.get("full_contexts"):
	        # Full view: one item per chunk, without the primary-context marker
	        for context_item in example["full_contexts"]:
	            html += _render_context_item(context_item.get('content', ''))
	    elif example.get("contexts"):
	        # Default view: the highlighted context items
	        for context_item in example["contexts"]:
	            # Add appropriate class for primary chunks
	            extra_class = " primary-context" if context_item.get('is_primary', False) else ""
	            html += _render_context_item(context_item.get('content', ''), extra_class)
	    else:
	        # If no contexts available, show a message
	        html += '<div class="context-item">No context available. Try toggling to full context view.</div>'

	    # Close the container div
	    html += '</div>'

	    return html