Spaces:

julian-schelb
/

Loci-Similes-Demo

Sleeping

App Files Files Community

julian-schelb commited on 28 days ago

Commit

dc017a9

verified ·

1 Parent(s): 4b23b4b

Create results_page.py

Browse files

Files changed (1) hide show

results_page.py +362 -0

results_page.py ADDED Viewed

	@@ -0,0 +1,362 @@

+"""Results stage for the Loci Similes GUI."""
+from __future__ import annotations
+import csv
+import io
+import re
+from typing import TYPE_CHECKING
+try:
+    import gradio as gr
+except ImportError as exc:
+    missing = getattr(exc, "name", None)
+    base_msg = (
+        "Optional GUI dependencies are missing. Install them via "
+        "'pip install locisimiles[gui]' (Python 3.13+ also requires the "
+        "audioop-lts backport) to use the Gradio interface."
+    )
+    if missing and missing != "gradio":
+        raise ImportError(f"{base_msg} (missing package: {missing})") from exc
+    raise ImportError(base_msg) from exc
+if TYPE_CHECKING:
+    from locisimiles.document import Document, TextSegment
+import tempfile
+from typing import Any, Dict, List, Tuple
+try:
+    import gradio as gr
+except ImportError as exc:
+    raise ImportError("Gradio is required for results page") from exc
+from locisimiles.document import Document, TextSegment
+# Type aliases from pipeline
+FullDict = Dict[str, List[Tuple[TextSegment, float, float]]]
+def update_results_display(results: FullDict | None, query_doc: Document | None, threshold: float = 0.5) -> tuple[dict, dict, dict]:
+    """Update the results display with new data.
+    Args:
+        results: Pipeline results
+        query_doc: Query document
+        threshold: Classification probability threshold for counting finds
+    Returns:
+        Tuple of (query_segments_update, query_segments_state, matches_dict_state)
+    """
+    query_segments, matches_dict = _convert_results_to_display(results, query_doc, threshold)
+    return (
+        gr.update(value=query_segments),  # Update query segments dataframe
+        query_segments,                   # Update query segments state
+        matches_dict,                     # Update matches dict state
+    )
+def _format_metric_with_bar(value: float, is_above_threshold: bool = False) -> str:
+    """Format a metric value with a visual progress bar.
+    Args:
+        value: Metric value between 0 and 1
+        is_above_threshold: Whether to highlight this value
+    Returns:
+        HTML string with progress bar
+    """
+    percentage = int(value * 100)
+    # Choose color based on threshold
+    if is_above_threshold:
+        bar_color = "#6B9BD1"  # Blue accent for findings
+        bg_color = "#E3F2FD"   # Light blue background
+    else:
+        bar_color = "#B0B0B0"  # Gray for below threshold
+        bg_color = "#F5F5F5"   # Light gray background
+    html = f'''
+    <div style="display: flex; align-items: center; gap: 8px; width: 100%;">
+        <div style="flex: 1; background-color: {bg_color}; border-radius: 4px; overflow: hidden; height: 20px; position: relative;">
+            <div style="background-color: {bar_color}; width: {percentage}%; height: 100%; transition: width 0.3s;"></div>
+        </div>
+        <span style="min-width: 45px; text-align: right; font-weight: {'bold' if is_above_threshold else 'normal'};">{value:.3f}</span>
+    </div>
+    '''
+    return html
+def _convert_results_to_display(results: FullDict | None, query_doc: Document | None, threshold: float = 0.5) -> tuple[list[list], dict]:
+    """Convert pipeline results to display format.
+    Args:
+        results: Pipeline results (FullDict format)
+        query_doc: Query document
+        threshold: Classification probability threshold for counting finds
+    Returns:
+        Tuple of (query_segments_list, matches_dict)
+    """
+    if results is None or query_doc is None:
+        # Return empty data if no results
+        return [], {}
+    # First pass: Create raw matches dictionary and count finds
+    raw_matches = {}
+    find_counts = {}
+    for query_id, match_list in results.items():
+        # Sort by probability (descending) to show most likely matches first
+        sorted_matches = sorted(match_list, key=lambda x: x[2], reverse=True)  # x[2] is probability
+        # Store raw numeric values
+        raw_matches[query_id] = sorted_matches
+        # Count finds above threshold
+        find_counts[query_id] = sum(1 for _, _, prob in sorted_matches if prob >= threshold)
+    # Convert query document to list format with find counts
+    # Document is iterable and returns TextSegments in order
+    query_segments = []
+    for segment in query_doc:
+        find_count = find_counts.get(segment.id, 0)
+        query_segments.append([segment.id, segment.text, find_count])
+    # Second pass: Format matches with HTML progress bars
+    matches_dict = {}
+    for query_id, match_list in raw_matches.items():
+        matches_dict[query_id] = [
+            [
+                source_seg.id,
+                source_seg.text,
+                _format_metric_with_bar(round(similarity, 3), probability >= threshold),
+                _format_metric_with_bar(round(probability, 3), probability >= threshold)
+            ]
+            for source_seg, similarity, probability in match_list
+        ]
+    return query_segments, matches_dict
+def _on_query_select(evt: gr.SelectData, query_segments: list, matches_dict: dict) -> tuple[dict, dict]:
+    """Handle query segment selection and return matching source segments.
+    Note: evt.index[0] gives the row number when clicking anywhere in that row.
+    Args:
+        evt: Selection event data
+        query_segments: List of query segments
+        matches_dict: Dictionary mapping query IDs to matches
+    Returns:
+        A tuple of (prompt_visibility_update, dataframe_update_with_data)
+    """
+    if evt.index is None or len(evt.index) < 1:
+        return gr.update(visible=True), gr.update(visible=False)
+    row_index = evt.index[0]
+    if row_index >= len(query_segments):
+        return gr.update(visible=True), gr.update(visible=False)
+    segment_id = query_segments[row_index][0]
+    matches = matches_dict.get(segment_id, [])
+    # Hide prompt, show dataframe with results
+    return gr.update(visible=False), gr.update(value=matches, visible=True)
+def _extract_numeric_from_html(html_str: str) -> float:
+    """Extract numeric value from HTML formatted metric string.
+    Args:
+        html_str: HTML string with embedded numeric value
+    Returns:
+        Extracted numeric value
+    """
+    import re
+    # Extract the number from the span at the end: <span ...>0.XXX</span>
+    match = re.search(r'<span[^>]*>([\d.]+)</span>', html_str)
+    if match:
+        return float(match.group(1))
+    # Fallback: if it's already a number
+    try:
+        return float(html_str)
+    except (ValueError, TypeError):
+        return 0.0
+def _export_results_to_csv(query_segments: list, matches_dict: dict, threshold: float) -> str:
+    """Export results to a CSV file.
+    Args:
+        query_segments: List of query segments with find counts
+        matches_dict: Dictionary mapping query IDs to matches
+        threshold: Classification probability threshold
+    Returns:
+        Path to the temporary CSV file
+    """
+    # Create a temporary file
+    temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv', newline='', encoding='utf-8')
+    with temp_file as f:
+        writer = csv.writer(f)
+        # Write header
+        writer.writerow([
+            "Query_Segment_ID",
+            "Query_Text",
+            "Source_Segment_ID",
+            "Source_Text",
+            "Similarity",
+            "Probability",
+            "Above_Threshold"
+        ])
+        # Write data for each query segment
+        for query_row in query_segments:
+            query_id = query_row[0]
+            query_text = query_row[1]
+            # Get matches for this query segment
+            matches = matches_dict.get(query_id, [])
+            if matches:
+                for match in matches:
+                    source_id = match[0]
+                    source_text = match[1]
+                    # Extract numeric values from HTML formatted strings
+                    similarity = _extract_numeric_from_html(match[2]) if isinstance(match[2], str) else match[2]
+                    probability = _extract_numeric_from_html(match[3]) if isinstance(match[3], str) else match[3]
+                    above_threshold = "Yes" if probability >= threshold else "No"
+                    writer.writerow([
+                        query_id,
+                        query_text,
+                        source_id,
+                        source_text,
+                        similarity,
+                        probability,
+                        above_threshold
+                    ])
+            else:
+                # Write row even if no matches
+                writer.writerow([
+                    query_id,
+                    query_text,
+                    "",
+                    "",
+                    "",
+                    "",
+                    ""
+                ])
+    return temp_file.name
+def build_results_stage() -> tuple[gr.Step, dict[str, Any]]:
+    """Build the results stage UI.
+    Returns:
+        A tuple of (Step component, components_dict) where components_dict contains
+        references to all interactive components that need to be accessed later.
+    """
+    with gr.Step("Results", id=2) as step:
+        # State to hold current query segments and matches
+        query_segments_state = gr.State(value=[])
+        matches_dict_state = gr.State(value={})
+        gr.Markdown("### 📊 Step 3: View Results")
+        gr.Markdown(
+            "Select a query segment on the left to view potential intertextual references from the source document. "
+            "Similarity measures the cosine similarity between embeddings (0-1, higher = more similar). "
+            "Probability is the classifier's confidence that the pair represents an intertextual reference (0-1, higher = more likely)."
+        )
+        # Download button
+        with gr.Row():
+            download_btn = gr.DownloadButton("Download Results as CSV", variant="primary")
+        with gr.Row():
+            # Left column: Query segments
+            with gr.Column(scale=1):
+                gr.Markdown("### Query Document Segments")
+                query_segments = gr.Dataframe(
+                    value=[],
+                    headers=["Segment ID", "Text", "Finds"],
+                    interactive=False,
+                    show_label=False,
+                    label="Query Document Segments",
+                    wrap=True,
+                    max_height=600,
+                    col_count=(3, "fixed"),
+                )
+            # Right column: Matching source segments
+            with gr.Column(scale=1):
+                gr.Markdown("### Potential Intertextual References")
+                # Prompt shown initially
+                selection_prompt = gr.Markdown(
+                    """
+                    <div style="display: flex; align-items: center; justify-content: center; height: 400px; font-size: 18px; color: #666;">
+                        <div style="text-align: center;">
+                            <div style="font-size: 48px; margin-bottom: 20px;">←</div>
+                            <div>Select a query segment to view</div>
+                            <div>potential intertextual references</div>
+                        </div>
+                    </div>
+                    """,
+                    visible=True
+                )
+                # Dataframe hidden initially
+                source_matches = gr.Dataframe(
+                    headers=["Source ID", "Source Text", "Similarity", "Probability"],
+                    interactive=False,
+                    show_label=False,
+                    label="Potential Intertextual References from Source Document",
+                    wrap=True,
+                    max_height=600,
+                    visible=False,
+                    datatype=["str", "str", "html", "html"],  # Enable HTML rendering for metric columns
+                )
+        with gr.Row():
+            restart_btn = gr.Button("← Start Over", size="lg")
+    # Return the step and all components that need to be accessed
+    components = {
+        "query_segments": query_segments,
+        "query_segments_state": query_segments_state,
+        "matches_dict_state": matches_dict_state,
+        "source_matches": source_matches,
+        "selection_prompt": selection_prompt,
+        "download_btn": download_btn,
+        "restart_btn": restart_btn,
+    }
+    return step, components
+def setup_results_handlers(components: dict, walkthrough: gr.Walkthrough) -> None:
+    """Set up event handlers for the results stage.
+    Args:
+        components: Dictionary of UI components from build_results_stage
+        walkthrough: The Walkthrough component for navigation
+    """
+    # Selection handler for query segments
+    components["query_segments"].select(
+        fn=_on_query_select,
+        inputs=[components["query_segments_state"], components["matches_dict_state"]],
+        outputs=[components["selection_prompt"], components["source_matches"]],
+    )
+    # Restart button: Step 3 → Step 1
+    components["restart_btn"].click(
+        fn=lambda: gr.Walkthrough(selected=0),
+        outputs=walkthrough,
+    )