Spaces:

AnjaliSarawgi
/

gradio_app

Sleeping

App Files Files Community

AnjaliSarawgi committed 29 days ago

Commit

47e7fc8

1 Parent(s): e940076

changes

Browse files

Files changed (1) hide show

app.py +216 -136

app.py CHANGED Viewed

@@ -1,11 +1,13 @@
 """
 Gradio application for performing OCR on scanned Old Nepali documents.
 This script is a Gradio port of a Streamlit application originally built
 to visualize and edit OCR output. It loads a pre‑trained model for
 sequence decoding, accepts an input image (and optional segmentation
 XML in ALTO format), performs OCR on segmented lines, highlights tokens
 with low confidence and offers downloads of both the raw text and per
 token scores.
 The heavy lifting functions (model loading, pre‑processing, inference
 and highlighting) are adapted directly from the Streamlit version. The
 UI has been simplified for Gradio: users upload an image and optional
@@ -13,9 +15,12 @@ XML file, choose preprocessing steps and a highlight metric, then run
 OCR.  The results are displayed alongside the overlaid segmentation
 boxes and a table of token scores.  An editable textbox lets users
 modify the predicted text before downloading it.
 To run this app locally, install gradio (`pip install gradio`) and
 execute this script with Python:
     python gradio_app.py
 """
 import io
@@ -88,6 +93,7 @@ FONT_PATH: str = os.path.join("NotoSansDevanagari-Regular.ttf")
 @lru_cache(maxsize=1)
 def load_model():
     """Load the OCR model, tokenizer and feature extractor.
     Returns
     -------
     model : VisionEncoderDecoderModel
@@ -116,6 +122,20 @@ def load_model():
 #
 def clean_text(text: str) -> str:
     text = unicodedata.normalize("NFC", text)
     text = CLEANUP.sub("", text)
     return re.sub(r"\s+", "", text)
@@ -123,12 +143,14 @@ def clean_text(text: str) -> str:
 def prepare_image(image: Image.Image, max_side: int = RESIZE_MAX_SIDE) -> Image.Image:
     """Resize the image so that its longest side equals max_side.
     Parameters
     ----------
     image : PIL.Image
         Input image.
     max_side : int, optional
         Maximum allowed size for the longest side of the image.
     Returns
     -------
     PIL.Image
@@ -151,6 +173,7 @@ def get_amp_ctx():
 #
 def parse_boxes_from_xml(xml_bytes: bytes, level: str = "line", image_size: tuple | None = None):
     """Parse ALTO or PAGE XML to extract bounding boxes.
     Parameters
     ----------
     xml_bytes : bytes
@@ -161,6 +184,7 @@ def parse_boxes_from_xml(xml_bytes: bytes, level: str = "line", image_size: tupl
         If provided, image_size=(width, height) allows rescaling
         coordinates to match the actual image.  ALTO files often store
         absolute page sizes that differ from the image dimensions.
     Returns
     -------
     list of dict
@@ -308,12 +332,14 @@ def sort_boxes_reading_order(boxes, y_tol: int = 10):
 def draw_boxes(img: Image.Image, boxes):
     """Overlay semi‑transparent red polygons or rectangles on an image.
     Parameters
     ----------
     img : PIL.Image
         The base image.
     boxes : list of dict
         Segmentation boxes with either 'points' or 'bbox' keys.
     Returns
     -------
     PIL.Image
@@ -343,19 +369,12 @@ def draw_boxes(img: Image.Image, boxes):
 # ----------------------------------------------------------------------
 # OCR inference per line
 #
-# def predict_and_score_once(image: Image.Image, line_id: int = 1, topk: int = TOPK):
-def predict_and_score_once(
-    image: Image.Image,
-    model,
-    tokenizer,
-    feature_extractor,
-    device,
-    line_id: int = 1,
-    topk: int = TOPK,
-):
     """Run the model on a single cropped line and return predictions and scores.
     This helper wraps the model.generate call to obtain per‑token
     probabilities and derives a DataFrame summarizing each decoding step.
     Parameters
     ----------
     image : PIL.Image
@@ -364,6 +383,7 @@ def predict_and_score_once(
         Identifier used in the output DataFrame.
     topk : int, optional
         Number of alternative tokens to keep for each decoding position.
     Returns
     -------
     decoded_text : str
@@ -373,7 +393,7 @@ def predict_and_score_once(
         columns: line_id, seq_pos, token_id, token, confidence,
         rel_prob, entropy, gap12, alt_tokens, alt_probs.
     """
-    # model, tokenizer, feature_extractor, device = load_model()
     img = prepare_image(image)
     pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(device)
     amp_ctx = get_amp_ctx()
@@ -381,7 +401,7 @@ def predict_and_score_once(
         try:
             out = model.generate(
                 pixel_values,
-                max_length=100,
                 num_beams=5,
                 do_sample=False,
                 return_dict_in_generate=True,
@@ -394,7 +414,7 @@ def predict_and_score_once(
             if "out of memory" in str(e).lower():
                 out = model.generate(
                     pixel_values,
-                    max_length=100,
                     num_beams=1,
                     do_sample=False,
                     return_dict_in_generate=True,
@@ -510,15 +530,16 @@ def parse_alt_tokens(s: str):
     return [(t if t is not None else "") for t in (s or "").split("|")]
 def highlight_tokens_with_tooltips(
     line_text: str, df_tok: pd.DataFrame, red_threshold: float, metric_column: str
 ) -> str:
     """Insert HTML spans around tokens whose chosen metric exceeds threshold.
     The metric column can be "rel_prob" (relative probability) or
     "entropy".  Tokens with a value strictly greater than red_threshold
     will be wrapped in a span with a tooltip listing alternative
     predictions and their probabilities.
     Parameters
     ----------
     line_text : str
@@ -529,6 +550,7 @@ def highlight_tokens_with_tooltips(
         Values above this threshold will be highlighted.
     metric_column : str
         Column name in df_tok used for thresholding.
     Returns
     -------
     str
@@ -610,174 +632,232 @@ def _html_escape(s: str) -> str:
 # ----------------------------------------------------------------------
 # Main OCR wrapper for Gradio
 #
-import tempfile
-def run_ocr(image, xml_file, apply_gray, apply_bin, highlight_metric):
-    if image is None:
-        return None, "", None, "", None, None
     pil_img = Image.fromarray(image).convert("RGB")
     if apply_gray:
         pil_img = pil_img.convert("L").convert("RGB")
     if apply_bin:
         img_cv = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
         _, bin_img = cv2.threshold(img_cv, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
         pil_img = Image.fromarray(bin_img).convert("RGB")
     boxes = []
-    if xml_file:
-        if isinstance(xml_file, (bytes, bytearray)):
-            xml_bytes = bytes(xml_file)
-        elif isinstance(xml_file, str):
-            with open(xml_file, "rb") as f:
-                xml_bytes = f.read()
-        elif hasattr(xml_file, "read"):
-            xml_bytes = xml_file.read()
-        elif isinstance(xml_file, dict) and "data" in xml_file:
-            xml_bytes = xml_file.get("data")
-        else:
-            xml_bytes = None
-        if xml_bytes:
             boxes = parse_boxes_from_xml(xml_bytes, level="line", image_size=pil_img.size)
             boxes = sort_boxes_reading_order(boxes)[:MAX_LINES]
     dfs = []
-    parts = []
-    plain_lines = []
-    model, tokenizer, feature_extractor, device = load_model()
     if boxes:
         for idx, b in enumerate(boxes, 1):
-            x1, y1, x2, y2 = b["bbox"]
-            crop = pil_img.crop((x1, y1, x2, y2))
-            # seg_text, df_tok = predict_and_score_once(crop, line_id=idx, topk=TOPK)
-            seg_text, df_tok = predict_and_score_once(
-                crop, model, tokenizer, feature_extractor, device, line_id=idx, topk=TOPK
-            )
             seg_text = clean_text(seg_text)
-            plain_lines.append(seg_text)
             if highlight_metric == "Relative Probability":
-                seg_html = highlight_tokens_with_tooltips(seg_text, df_tok, REL_PROB_TH, "rel_prob")
             else:
-                seg_html = highlight_tokens_with_tooltips(seg_text, df_tok, 0.10, "entropy")
-            parts.append(seg_html)
             dfs.append(df_tok)
-        predicted_html = "<br>".join(parts)
         df_all = pd.concat(dfs, ignore_index=True)
     else:
-        # seg_text, df_all = predict_and_score_once(pil_img, line_id=1, topk=TOPK)
-        seg_text, df_all = predict_and_score_once(
-            pil_img, model, tokenizer, feature_extractor, device, line_id=1, topk=TOPK
-        )
         seg_text = clean_text(seg_text)
         if highlight_metric == "Relative Probability":
-            seg_html = highlight_tokens_with_tooltips(seg_text, df_all, REL_PROB_TH, "rel_prob")
         else:
-            seg_html = highlight_tokens_with_tooltips(seg_text, df_all, 0.10, "entropy")
-        predicted_html = seg_html
     overlay_img = draw_boxes(pil_img, boxes) if boxes else pil_img
-    # Clean text for editing (strip HTML)
-    # clean_pred_text = re.sub(r"<[^>]+>", "", predicted_html)
-    # clean_pred_text = re.sub(r"<[^>]+>", "", predicted_html)
-    # clean_pred_text = clean_pred_text.replace("<br>", "\n").strip()
-    clean_pred_text = "\n".join(plain_lines)
-    # Save outputs to temporary files
-    tmp_dir = tempfile.mkdtemp()
-    txt_path = os.path.join(tmp_dir, "ocr_prediction.txt")
-    # csv_path = os.path.join(tmp_dir, "token_scores.csv")
-    with open(txt_path, "w", encoding="utf-8") as f:
-        f.write(clean_pred_text)
-    # if df_all is not None and not df_all.empty:
-        # df_all.to_csv(csv_path, index=False, encoding="utf-8")
-    # return overlay_img, predicted_html, df_all, clean_pred_text, txt_path, csv_path
-    return overlay_img, predicted_html, clean_pred_text, txt_path
 # ----------------------------------------------------------------------
 # Build Gradio Interface
 #
 def create_gradio_interface():
     with gr.Blocks(title="Old Nepali HTR") as demo:
-        gr.Markdown("""
-        # Old Nepali HTR (Gradio)
-        Upload a scanned image and (optionally) a segmentation XML file.
-        Choose preprocessing steps and a highlight metric, then click **Run OCR**.
-        """)
         with gr.Row():
             image_input = gr.Image(type="numpy", label="Upload Image")
-            xml_input = gr.File(label="Upload XML (optional)", type="binary")
         with gr.Row():
             apply_gray_checkbox = gr.Checkbox(label="Convert to Grayscale", value=False)
             apply_bin_checkbox = gr.Checkbox(label="Binarize", value=False)
         run_btn = gr.Button("Run OCR")
-        with gr.Row():
-            overlay_output = gr.Image(label="Detected Regions")
-            predictions_output = gr.HTML(
-                label="Predictions",
-                container=True,
-                elem_classes=["predictions-box"]
-            )
-        # Add subtle border to Predictions
-        gr.HTML("""
-        <style>
-        .predictions-box {
-            border: 1px solid #d0d0d0;
-            border-radius: 8px;
-            padding: 12px;
-            background-color: #fafafa;
-            min-height: 200px;
-            overflow-y: auto;
-        }
-        </style>
-        """)
-        editable_text = gr.Textbox(label="Edit Recognized Text", lines=8, interactive=True)
-        with gr.Row():
-            download_text = gr.File(label="Download Raw Text (.txt)")
-            download_edited = gr.File(label="Download Edited Text (.txt)")
-        # Run OCR and populate results
         run_btn.click(
-            fn=run_ocr,
-            inputs=[image_input, xml_input, apply_gray_checkbox, apply_bin_checkbox],
-            outputs=[
-                overlay_output,
-                predictions_output,
-                editable_text,
-                download_text,
-            ],
         )
-        # Save edited text dynamically
-        def save_edited_text(text):
-            import tempfile, os
-            tmp_dir = tempfile.mkdtemp()
-            path = os.path.join(tmp_dir, "edited_ocr_text.txt")
-            with open(path, "w", encoding="utf-8") as f:
-                f.write(text)
-            return path
-        editable_text.change(fn=save_edited_text, inputs=editable_text, outputs=download_edited)
     return demo

 """
 Gradio application for performing OCR on scanned Old Nepali documents.
 This script is a Gradio port of a Streamlit application originally built
 to visualize and edit OCR output. It loads a pre‑trained model for
 sequence decoding, accepts an input image (and optional segmentation
 XML in ALTO format), performs OCR on segmented lines, highlights tokens
 with low confidence and offers downloads of both the raw text and per
 token scores.
 The heavy lifting functions (model loading, pre‑processing, inference
 and highlighting) are adapted directly from the Streamlit version. The
 UI has been simplified for Gradio: users upload an image and optional
 XML file, choose preprocessing steps and a highlight metric, then run
 OCR.  The results are displayed alongside the overlaid segmentation
 boxes and a table of token scores.  An editable textbox lets users
 modify the predicted text before downloading it.
 To run this app locally, install gradio (`pip install gradio`) and
 execute this script with Python:
     python gradio_app.py
 """
 import io
 @lru_cache(maxsize=1)
 def load_model():
     """Load the OCR model, tokenizer and feature extractor.
     Returns
     -------
     model : VisionEncoderDecoderModel
 #
 def clean_text(text: str) -> str:
     """Normalize a decoded string and remove every whitespace character.
     Parameters
     ----------
     text : str
         The raw decoded string from the model.
     Returns
     -------
     str
         The NFC-normalized string with everything matched by the
         module-level CLEANUP pattern deleted and all whitespace runs
         removed (not merely collapsed), so the result contains no
         spaces, tabs or newlines.
     """
     # NFC first, so composed and decomposed forms of the same character
     # are treated identically by the substitutions that follow.
     normalized = unicodedata.normalize("NFC", text)
     without_noise = CLEANUP.sub("", normalized)
     without_spaces = re.sub(r"\s+", "", without_noise)
     return without_spaces
 def prepare_image(image: Image.Image, max_side: int = RESIZE_MAX_SIDE) -> Image.Image:
     """Resize the image so that its longest side equals max_side.
     Parameters
     ----------
     image : PIL.Image
         Input image.
     max_side : int, optional
         Maximum allowed size for the longest side of the image.
     Returns
     -------
     PIL.Image
 #
 def parse_boxes_from_xml(xml_bytes: bytes, level: str = "line", image_size: tuple | None = None):
     """Parse ALTO or PAGE XML to extract bounding boxes.
     Parameters
     ----------
     xml_bytes : bytes
         If provided, image_size=(width, height) allows rescaling
         coordinates to match the actual image.  ALTO files often store
         absolute page sizes that differ from the image dimensions.
     Returns
     -------
     list of dict
 def draw_boxes(img: Image.Image, boxes):
     """Overlay semi‑transparent red polygons or rectangles on an image.
     Parameters
     ----------
     img : PIL.Image
         The base image.
     boxes : list of dict
         Segmentation boxes with either 'points' or 'bbox' keys.
     Returns
     -------
     PIL.Image
 # ----------------------------------------------------------------------
 # OCR inference per line
 #
+def predict_and_score_once(image: Image.Image, line_id: int = 1, topk: int = TOPK):
     """Run the model on a single cropped line and return predictions and scores.
     This helper wraps the model.generate call to obtain per‑token
     probabilities and derives a DataFrame summarizing each decoding step.
     Parameters
     ----------
     image : PIL.Image
         Identifier used in the output DataFrame.
     topk : int, optional
         Number of alternative tokens to keep for each decoding position.
     Returns
     -------
     decoded_text : str
         columns: line_id, seq_pos, token_id, token, confidence,
         rel_prob, entropy, gap12, alt_tokens, alt_probs.
     """
+    model, tokenizer, feature_extractor, device = load_model()
     img = prepare_image(image)
     pixel_values = feature_extractor(images=img, return_tensors="pt").pixel_values.to(device)
     amp_ctx = get_amp_ctx()
         try:
             out = model.generate(
                 pixel_values,
+                max_length=MAX_LEN,
                 num_beams=5,
                 do_sample=False,
                 return_dict_in_generate=True,
             if "out of memory" in str(e).lower():
                 out = model.generate(
                     pixel_values,
+                    max_length=MAX_LEN,
                     num_beams=1,
                     do_sample=False,
                     return_dict_in_generate=True,
     return [(t if t is not None else "") for t in (s or "").split("|")]
 def highlight_tokens_with_tooltips(
     line_text: str, df_tok: pd.DataFrame, red_threshold: float, metric_column: str
 ) -> str:
     """Insert HTML spans around tokens whose chosen metric exceeds threshold.
     The metric column can be "rel_prob" (relative probability) or
     "entropy".  Tokens with a value strictly greater than red_threshold
     will be wrapped in a span with a tooltip listing alternative
     predictions and their probabilities.
     Parameters
     ----------
     line_text : str
         Values above this threshold will be highlighted.
     metric_column : str
         Column name in df_tok used for thresholding.
     Returns
     -------
     str
 # ----------------------------------------------------------------------
 # Main OCR wrapper for Gradio
 #
+def run_ocr(
+    image: np.ndarray | None,
+    xml_file: tuple | None,
+    apply_gray: bool,
+    apply_bin: bool,
+    highlight_metric: str,
+):
+    """Run the OCR pipeline on user inputs and return results for Gradio.
+    Parameters
+    ----------
+    image : numpy.ndarray or None
+        The uploaded image converted to a NumPy array by Gradio.  If
+        None, the function returns empty results.
+    xml_file : tuple or None
+        A tuple representing the uploaded XML file as provided by
+        gr.File.  The first element is the file name and the second is
+        bytes.  If None, no segmentation is applied and the entire
+        image is processed as a single line.
+    apply_gray : bool
+        Whether to convert the image to grayscale before OCR.
+    apply_bin : bool
+        Whether to apply binarization (Otsu threshold) before OCR.  If
+        selected, grayscale conversion is applied first automatically.
+    highlight_metric : str
+        Which metric to use for highlighting ("Relative Probability" or
+        "Entropy").
+    Returns
+    -------
+    overlay_img : PIL.Image or None
+        Image with segmentation boxes drawn.  None if no input image.
+    predictions_html : str
+        HTML formatted predicted text with highlighted tokens.
+    df_scores : pandas.DataFrame or None
+        DataFrame of per‑token statistics.  None if no input image.
+    txt_file_path : str or None
+        Path to a temporary .txt file containing the plain predicted text.
+    csv_file_path : str or None
+        Path to a temporary CSV file containing the extended token scores.
+    """
+    if image is None:
+        return None, "", None, None, None
+    # Convert the numpy array to a PIL image
     pil_img = Image.fromarray(image).convert("RGB")
+    # Apply preprocessing as requested
     if apply_gray:
         pil_img = pil_img.convert("L").convert("RGB")
     if apply_bin:
         img_cv = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
         _, bin_img = cv2.threshold(img_cv, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
         pil_img = Image.fromarray(bin_img).convert("RGB")
+    # Parse segmentation boxes if XML provided
     boxes = []
+    if xml_file is not None and isinstance(xml_file, tuple) and len(xml_file) == 2:
+        # xml_file comes as (name, bytes) from Gradio
+        _, xml_bytes = xml_file
+        try:
             boxes = parse_boxes_from_xml(xml_bytes, level="line", image_size=pil_img.size)
             boxes = sort_boxes_reading_order(boxes)[:MAX_LINES]
+        except Exception:
+            boxes = []
+    # Run OCR for each segmented line or the whole image
     dfs = []
+    concatenated_parts = []
+    line_text_by_id = {}
     if boxes:
+        pad = 2
         for idx, b in enumerate(boxes, 1):
+            # Create a tight crop around the line
+            if "points" in b:
+                pts = b["points"]
+                mask = Image.new("L", pil_img.size, 0)
+                ImageDraw.Draw(mask).polygon(pts, outline=1, fill=255)
+                seg_img = Image.new("RGB", pil_img.size, (255, 255, 255))
+                seg_img.paste(pil_img, mask=mask)
+                xs = [x for x, y in pts]
+                ys = [y for x, y in pts]
+                x1 = max(0, int(min(xs) - pad))
+                y1 = max(0, int(min(ys) - pad))
+                x2 = min(pil_img.width, int(max(xs) + pad))
+                y2 = min(pil_img.height, int(max(ys) + pad))
+                crop = seg_img.crop((x1, y1, x2, y2))
+            else:
+                x1, y1, x2, y2 = b["bbox"]
+                x1p = max(0, x1 - pad)
+                y1p = max(0, y1 - pad)
+                x2p = min(pil_img.width, x2 + pad)
+                y2p = min(pil_img.height, y2 + pad)
+                crop = pil_img.crop((x1p, y1p, x2p, y2p))
+            # Run inference on the crop
+            seg_text, df_tok = predict_and_score_once(crop, line_id=idx, topk=TOPK)
             seg_text = clean_text(seg_text)
+            # Choose metric
             if highlight_metric == "Relative Probability":
+                red_threshold = REL_PROB_TH
+                metric_col = "rel_prob"
             else:
+                red_threshold = 0.10  # heuristic threshold for entropy
+                metric_col = "entropy"
+            # Highlight uncertain tokens
+            seg_text_flagged = highlight_tokens_with_tooltips(seg_text, df_tok, red_threshold, metric_col)
+            concatenated_parts.append(seg_text_flagged)
+            df_tok["line_id"] = idx
             dfs.append(df_tok)
+            line_text_by_id[idx] = seg_text_flagged
+        predicted_html = "<br>".join(concatenated_parts).strip()
         df_all = pd.concat(dfs, ignore_index=True)
     else:
+        # Single pass on the whole image
+        seg_text, df_all = predict_and_score_once(pil_img, line_id=1, topk=TOPK)
         seg_text = clean_text(seg_text)
         if highlight_metric == "Relative Probability":
+            red_threshold = REL_PROB_TH
+            metric_col = "rel_prob"
         else:
+            red_threshold = 0.10
+            metric_col = "entropy"
+        seg_text_flagged = highlight_tokens_with_tooltips(seg_text, df_all, red_threshold, metric_col)
+        predicted_html = seg_text_flagged
+        line_text_by_id[1] = seg_text_flagged
+    # Draw overlay image
     overlay_img = draw_boxes(pil_img, boxes) if boxes else pil_img
+    # Create downloads
+    df_all = df_all.copy()
+    # Drop the last empty token per line to tidy up output
+    df_all.sort_values(["line_id", "seq_pos"], inplace=True)
+    to_drop = []
+    for line_id, group in df_all.groupby("line_id"):
+        if group.iloc[-1]["token"].strip() == "":
+            to_drop.append(group.index[-1])
+    df_all = df_all.drop(index=to_drop)
+    # Prepare plain text by stripping HTML tags and replacing <br>
+    plain_text = re.sub(r"<[^>]*>", "", predicted_html.replace("<br>", "\n"))
+    # Write temporary files
+    txt_path = None
+    csv_path = None
+    try:
+        txt_fd = io.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8")
+        txt_fd.write(plain_text)
+        txt_fd.flush()
+        txt_path = txt_fd.name
+        txt_fd.close()
+    except Exception:
+        txt_path = None
+    try:
+        csv_fd = io.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", encoding="utf-8")
+        df_all.to_csv(csv_fd, index=False)
+        csv_fd.flush()
+        csv_path = csv_fd.name
+        csv_fd.close()
+    except Exception:
+        csv_path = None
+    return overlay_img, predicted_html, df_all, txt_path, csv_path
 # ----------------------------------------------------------------------
 # Build Gradio Interface
 #
 def create_gradio_interface():
     """Create and return the Gradio Blocks interface.
     The layout holds the image and optional segmentation XML uploads,
     the preprocessing checkboxes, the highlight-metric selector and a
     "Run OCR" button, followed by the outputs: overlay image,
     highlighted HTML, token-score table and the two download slots.
     An editable textbox mirrors the predicted text and can be exported
     to its own download slot.
     Returns
     -------
     gr.Blocks
         The assembled, not yet launched, application.
     """
     import tempfile
     with gr.Blocks(title="Old Nepali HTR") as demo:
         gr.Markdown("""# Old Nepali HTR (Gradio)\n\nUpload a scanned image and (optionally) a segmentation XML file.  Choose preprocessing\nsteps and a highlight metric, then click **Run OCR** to extract the text.\nUncertain tokens are highlighted with tooltips showing alternative predictions.\nYou can edit the plain text below and download it or the full token scores.""")
         with gr.Row():
             image_input = gr.Image(type="numpy", label="Upload Image")
             # type="binary" hands the callback the raw file content.
             xml_input = gr.File(label="Upload segmentation XML (optional)", type="binary")
         with gr.Row():
             apply_gray_checkbox = gr.Checkbox(label="Convert to Grayscale", value=False)
             apply_bin_checkbox = gr.Checkbox(label="Binarize", value=False)
             metric_radio = gr.Radio([
                 "Relative Probability",
                 "Entropy",
             ], label="Highlight tokens by", value="Relative Probability")
         run_btn = gr.Button("Run OCR")
         # Outputs
         overlay_output = gr.Image(label="Detected Regions")
         predictions_output = gr.HTML(label="Predictions (HTML)")
         df_output = gr.DataFrame(label="Token Scores", interactive=False)
         txt_file_output = gr.File(label="Download OCR Prediction (.txt)")
         csv_file_output = gr.File(label="Download Token Scores (.csv)")
         # Editable text
         edited_text = gr.Textbox(
             label="Edit full predicted text", lines=8, interactive=True
         )
         download_edited_btn = gr.Button("Download edited text")
         # Separate slot so exporting the edited text does not replace the
         # raw OCR prediction download.
         edited_file_output = gr.File(label="Download Edited Text (.txt)")
         # Callback for OCR
         def on_run(image, xml, gray, binarize, metric):
             # run_ocr documents its XML argument as a (name, bytes) pair,
             # so wrap the raw upload accordingly.
             if isinstance(xml, (bytes, bytearray)):
                 xml = ("segmentation.xml", bytes(xml))
             return run_ocr(image, xml, gray, binarize, metric)
         run_btn.click(
             fn=on_run,
             inputs=[image_input, xml_input, apply_gray_checkbox, apply_bin_checkbox, metric_radio],
             outputs=[overlay_output, predictions_output, df_output, txt_file_output, csv_file_output],
         )
         # Populate editable text with plain text from predictions
         def update_edited_text(pred_html):
             plain = re.sub(r"<[^>]*>", "", (pred_html or "").replace("<br>", "\n"))
             return plain
         predictions_output.change(
             fn=update_edited_text,
             inputs=predictions_output,
             outputs=edited_text,
         )
         # Download edited text by writing to a temporary file
         def download_edited(txt):
             if not txt:
                 return None
             try:
                 # delete=False: the file must outlive this callback so
                 # that it can still be served for download.
                 with tempfile.NamedTemporaryFile(
                     delete=False, suffix=".txt", mode="w", encoding="utf-8"
                 ) as fh:
                     fh.write(txt)
                     return fh.name
             except OSError:
                 return None
         download_edited_btn.click(
             fn=download_edited,
             inputs=edited_text,
             outputs=edited_file_output,
         )
     return demo