Spaces:

imperiusrex
/

Handwritten_OCR

Sleeping

App Files Community

imperiusrex committed on Jul 31

Commit

7f16886

verified ·

1 Parent(s): f40a04a

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -27

app.py CHANGED Viewed

@@ -1,6 +1,3 @@
-# Import the GPU decorator for ZeroGPU Spaces
-# This will be a no-op if the space is not configured for ZeroGPU
-# but it is required for the specified hardware to work correctly.
 from spaces import GPU
 import os
@@ -8,13 +5,19 @@ import cv2
 import numpy as np
 import torch
 import tempfile
 import gradio as gr
 from PIL import Image
 from pdf2image import convert_from_path
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-from paddleocr import PaddleOCR, TextDetection
 # Set the GPU device if available
 # The `spaces.GPU` decorator handles the dynamic GPU allocation, but we still need to
@@ -80,8 +83,10 @@ def process_image_page(img):
     arr = []
     # The OCR result is a list of lists, where each inner list represents a text line.
     # The first element is the bounding box coordinates.
-    for line in ocr_result[0]:
-        arr.append(line[0])
     print(f"Detected {len(arr)} lines in this page.")
@@ -110,7 +115,7 @@ def process_image_page(img):
         warped = cv2.warpPerspective(img, M, (width, height))
         cropped_images.append(warped)
-    # Reverse cropped images and corresponding boxes
     cropped_images.reverse()
     arr.reverse()
@@ -135,27 +140,38 @@ def process_file_and_create_pdf(file):
     The @GPU decorator ensures this function is run on the GPU.
     """
     if file is None:
-        return None, "Please upload a file."
     temp_output_dir = tempfile.mkdtemp()
     output_pdf_path = os.path.join(temp_output_dir, "ocr_results.pdf")
     try:
         if file.name.lower().endswith('.pdf'):
             # Convert PDF to images
             print(f"Converting PDF {file.name} to images...")
-            # Use `poppler_path` if poppler is installed on the system, otherwise
-            # it might be necessary to install it via a `packages.txt` file.
-            # Here we assume it's available.
             images = convert_from_path(file.name, dpi=300)
             c = canvas.Canvas(output_pdf_path, pagesize=letter)
             width, height = letter
             for page_num, page in enumerate(images):
                 print(f"\nProcessing page {page_num + 1}")
                 img_cv = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
-                results, original_image = process_image_page(img_cv)
                 c.setFont("Helvetica-Bold", 14)
                 c.drawString(50, height - 40, f"Page {page_num + 1} - OCR Results")
@@ -178,6 +194,15 @@ def process_file_and_create_pdf(file):
             if img_cv is None:
                 raise ValueError("Failed to load image.")
             results, original_image = process_image_page(img_cv)
             c = canvas.Canvas(output_pdf_path, pagesize=letter)
@@ -185,14 +210,11 @@ def process_file_and_create_pdf(file):
             c.setFont("Helvetica-Bold", 14)
             c.drawString(50, height - 40, "Image OCR Results")
-            # The input file from Gradio is a temp file that will be cleaned up.
-            # We can't display it directly in the PDF from its path.
-            # To draw it in the PDF, we save it to a new temporary path.
             temp_img_path = os.path.join(temp_output_dir, "original_image.png")
             original_image.save(temp_img_path)
-            # Draw the image on the PDF
             c.drawImage(temp_img_path, 50, height - 300, width=200, preserveAspectRatio=True)
             y = height - 350
             c.setFont("Helvetica", 12)
@@ -204,15 +226,15 @@ def process_file_and_create_pdf(file):
                     c.setFont("Helvetica", 12)
                     y = height - 50
             c.save()
-            os.remove(temp_img_path)
-        return output_pdf_path
     except Exception as e:
         print(f"An error occurred: {e}")
         # Clean up temporary directory on error
-        # shutil.rmtree(temp_output_dir)
-        return None
 # Gradio Interface
 # The `@GPU` decorator is used here to ensure this function runs on a GPU.
@@ -220,17 +242,20 @@ def process_file_and_create_pdf(file):
 def process_file_for_gradio(file):
     # This wrapper function is needed because Gradio's `File` component passes a temp file.
     # We call our main processing function and return the path to the output PDF.
-    output_path = process_file_and_create_pdf(file)
     if output_path is None:
-        return None
-    return output_path
 demo = gr.Interface(
     fn=process_file_for_gradio,
     inputs=gr.File(label="Upload an Image (PNG, JPG) or a PDF", file_types=['.png', '.jpg', '.jpeg', '.pdf']),
-    outputs=gr.File(label="Download OCR Results PDF"),
     title="OCR App with PaddleOCR and TrOCR",
-    description="Upload an image or a multi-page PDF to get an output PDF with the recognized text from each page.",
     examples=[
         # Here you can provide paths to example files in your repo
         # "example.png",

 from spaces import GPU
 import os
 import numpy as np
 import torch
 import tempfile
+import shutil
 import gradio as gr
 from PIL import Image
 from pdf2image import convert_from_path
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+from paddleocr import PaddleOCR
+from paddleocr.ppocr.utils.logging import disable_logger
+from IPython.display import display
+# Disable PaddleOCR logging for a cleaner output
+disable_logger()
 # Set the GPU device if available
 # The `spaces.GPU` decorator handles the dynamic GPU allocation, but we still need to
     arr = []
     # The OCR result is a list of lists, where each inner list represents a text line.
     # The first element is the bounding box coordinates.
+    # Check if ocr_result is not None and has at least one element
+    if ocr_result and ocr_result[0]:
+        for line in ocr_result[0]:
+            arr.append(line[0])
     print(f"Detected {len(arr)} lines in this page.")
         warped = cv2.warpPerspective(img, M, (width, height))
         cropped_images.append(warped)
+    # Reverse cropped images and corresponding boxes to match reading order
     cropped_images.reverse()
     arr.reverse()
     The @GPU decorator ensures this function is run on the GPU.
     """
     if file is None:
+        return None, None
     temp_output_dir = tempfile.mkdtemp()
     output_pdf_path = os.path.join(temp_output_dir, "ocr_results.pdf")
+    input_image_for_display = None
     try:
         if file.name.lower().endswith('.pdf'):
             # Convert PDF to images
             print(f"Converting PDF {file.name} to images...")
             images = convert_from_path(file.name, dpi=300)
+            if images:
+                # Set the first page as the image to display
+                input_image_for_display = images[0]
             c = canvas.Canvas(output_pdf_path, pagesize=letter)
             width, height = letter
             for page_num, page in enumerate(images):
                 print(f"\nProcessing page {page_num + 1}")
                 img_cv = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
+                # Check if the background is dark and text is light (simple heuristic)
+                gray_image = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
+                avg_intensity = np.mean(gray_image)
+                if avg_intensity < 100:  # Example threshold
+                    print("Inverting colors for dark background.")
+                    img_cv = cv2.bitwise_not(img_cv)
+                results, _ = process_image_page(img_cv)
                 c.setFont("Helvetica-Bold", 14)
                 c.drawString(50, height - 40, f"Page {page_num + 1} - OCR Results")
             if img_cv is None:
                 raise ValueError("Failed to load image.")
+            input_image_for_display = Image.fromarray(cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB))
+            # Check if the background is dark and text is light (simple heuristic)
+            gray_image = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
+            avg_intensity = np.mean(gray_image)
+            if avg_intensity < 100:  # Example threshold
+                print("Inverting colors for dark background.")
+                img_cv = cv2.bitwise_not(img_cv)
             results, original_image = process_image_page(img_cv)
             c = canvas.Canvas(output_pdf_path, pagesize=letter)
             c.setFont("Helvetica-Bold", 14)
             c.drawString(50, height - 40, "Image OCR Results")
+            # Draw the original image on the PDF for context
             temp_img_path = os.path.join(temp_output_dir, "original_image.png")
             original_image.save(temp_img_path)
             c.drawImage(temp_img_path, 50, height - 300, width=200, preserveAspectRatio=True)
+            os.remove(temp_img_path)
             y = height - 350
             c.setFont("Helvetica", 12)
                     c.setFont("Helvetica", 12)
                     y = height - 50
             c.save()
+        return output_pdf_path, input_image_for_display
     except Exception as e:
         print(f"An error occurred: {e}")
         # Clean up temporary directory on error
+        if os.path.exists(temp_output_dir):
+            shutil.rmtree(temp_output_dir)
+        return None, None
 # Gradio Interface
 # The `@GPU` decorator is used here to ensure this function runs on a GPU.
 def process_file_for_gradio(file):
     # This wrapper function is needed because Gradio's `File` component passes a temp file.
     # We call our main processing function and return the path to the output PDF.
+    output_path, input_image = process_file_and_create_pdf(file)
     if output_path is None:
+        return None, None
+    return output_path, input_image
 demo = gr.Interface(
     fn=process_file_for_gradio,
     inputs=gr.File(label="Upload an Image (PNG, JPG) or a PDF", file_types=['.png', '.jpg', '.jpeg', '.pdf']),
+    outputs=[
+        gr.File(label="Download OCR Results PDF", interactive=False, visible=True),
+        gr.Image(label="Uploaded Image Preview", interactive=False)
+    ],
     title="OCR App with PaddleOCR and TrOCR",
+    description="Upload an image or a multi-page PDF to get an output PDF with the recognized text from each page. The output PDF will be downloaded automatically.",
     examples=[
         # Here you can provide paths to example files in your repo
         # "example.png",