Spaces:

imperiusrex
/

Handwritten_OCR

Sleeping

App Files Files Community

imperiusrex committed on Jul 31

Commit

631b41e

verified ·

1 Parent(s): 5506faf

Create app.py

Browse files

Files changed (1) hide show

app.py +244 -0

app.py ADDED Viewed

	@@ -0,0 +1,244 @@

+# Import the GPU decorator used by ZeroGPU Spaces.
+# On hardware that is not ZeroGPU the decorator has no effect, but the import
+# must still be present because the Gradio callback below is decorated with it.
+from spaces import GPU
+import os
+import cv2
+import numpy as np
+import torch
+import tempfile
+import gradio as gr
+from PIL import Image
+from pdf2image import convert_from_path
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+from paddleocr import PaddleOCR, TextDetection
+# Set the GPU device if available
+# The `spaces.GPU` decorator handles the dynamic GPU allocation, but we still need to
+# specify the device for PyTorch and other GPU-enabled libraries.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+# --- MODEL LOADING ---
+# Load models globally so they are only initialized once when the app starts.
+# Initialize the PaddleOCR detection model
+# `use_angle_cls=False` is set for efficiency, as we are already using
+# perspective warping to straighten the text.
+print("Initializing PaddleOCR text detection model...")
+try:
+    # Use the PaddleOCR class with a specific model for detection only
+    det_model = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=torch.cuda.is_available(), show_log=False)
+except Exception as e:
+    print(f"Error initializing PaddleOCR: {e}")
+    det_model = None
+# Initialize the TrOCR recognition model and processor
+print("Initializing TrOCR text recognition model...")
+try:
+    trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
+    trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")
+    trocr_model.eval()
+    trocr_model.to(device)
+except Exception as e:
+    print(f"Error initializing TrOCR: {e}")
+    trocr_model = None
+    trocr_processor = None
+# Helper function to save a temp image
+def save_temp_image(img):
+    """Save an image array to a temporary file and return the path."""
+    temp_fd, temp_path = tempfile.mkstemp(suffix='.png')
+    cv2.imwrite(temp_path, img)
+    os.close(temp_fd)
+    return temp_path
+def process_image_page(img):
+    """
+    Process a single image to detect polygons, crop regions, and recognize text.
+    Returns a list of [box, text] for each cropped region and the original PIL image.
+    """
+    if det_model is None or trocr_model is None:
+        raise RuntimeError("OCR models are not loaded. Please check logs for errors.")
+    # Convert OpenCV image (BGR numpy array) to PIL Image (RGB)
+    original_pil_image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+    # PaddleOCR's predict method takes a file path, so we'll save the image to a temp file
+    temp_image_path = save_temp_image(img)
+    # Detect polygons using PaddleOCR
+    # The `ocr` method in PaddleOCR returns both detection and recognition results.
+    # We will use it just for the detection polygons.
+    ocr_result = det_model.ocr(temp_image_path)
+    os.remove(temp_image_path)
+    arr = []
+    # The OCR result is a list of lists, where each inner list represents a text line.
+    # The first element is the bounding box coordinates.
+    for line in ocr_result[0]:
+        arr.append(line[0])
+    print(f"Detected {len(arr)} lines in this page.")
+    cropped_images = []
+    for box in arr:
+        box = np.array(box, dtype=np.float32)
+        # Compute width and height of the straightened image
+        width_a = np.linalg.norm(box[0] - box[1])
+        width_b = np.linalg.norm(box[2] - box[3])
+        height_a = np.linalg.norm(box[0] - box[3])
+        height_b = np.linalg.norm(box[1] - box[2])
+        width = int(max(width_a, width_b))
+        height = int(max(height_a, height_b))
+        dst_rect = np.array([
+            [0, 0],
+            [width - 1, 0],
+            [width - 1, height - 1],
+            [0, height - 1]
+        ], dtype=np.float32)
+        # Perspective transform
+        M = cv2.getPerspectiveTransform(box, dst_rect)
+        warped = cv2.warpPerspective(img, M, (width, height))
+        cropped_images.append(warped)
+    # Reverse cropped images and corresponding boxes
+    cropped_images.reverse()
+    arr.reverse()
+    # Text recognition with TrOCR
+    results = []
+    for i, crop in enumerate(cropped_images):
+        image_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
+        pixel_values = trocr_processor(images=image_pil, return_tensors="pt").pixel_values.to(device)
+        with torch.no_grad():
+            generated_ids = trocr_model.generate(pixel_values, max_new_tokens=64)
+            generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        results.append([arr[i], generated_text])
+        print(f"Recognized: {generated_text}")
+    return results, original_pil_image
+def process_file_and_create_pdf(file):
+    """
+    Main function to process a file (image or PDF) and return a path to a new PDF.
+    The @GPU decorator ensures this function is run on the GPU.
+    """
+    if file is None:
+        return None, "Please upload a file."
+    temp_output_dir = tempfile.mkdtemp()
+    output_pdf_path = os.path.join(temp_output_dir, "ocr_results.pdf")
+    try:
+        if file.name.lower().endswith('.pdf'):
+            # Convert PDF to images
+            print(f"Converting PDF {file.name} to images...")
+            # Use `poppler_path` if poppler is installed on the system, otherwise
+            # it might be necessary to install it via a `packages.txt` file.
+            # Here we assume it's available.
+            images = convert_from_path(file.name, dpi=300)
+            c = canvas.Canvas(output_pdf_path, pagesize=letter)
+            width, height = letter
+            for page_num, page in enumerate(images):
+                print(f"\nProcessing page {page_num + 1}")
+                img_cv = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
+                results, original_image = process_image_page(img_cv)
+                c.setFont("Helvetica-Bold", 14)
+                c.drawString(50, height - 40, f"Page {page_num + 1} - OCR Results")
+                y = height - 60
+                c.setFont("Helvetica", 12)
+                for _, text in results:
+                    c.drawString(50, y, text)
+                    y -= 15
+                    if y < 50:
+                        c.showPage()
+                        c.setFont("Helvetica-Bold", 14)
+                        c.drawString(50, height - 40, f"Page {page_num + 1} (cont.) - OCR Results")
+                        y = height - 60
+                c.showPage()
+            c.save()
+        else: # Handle single image file
+            img_cv = cv2.imread(file.name)
+            if img_cv is None:
+                raise ValueError("Failed to load image.")
+            results, original_image = process_image_page(img_cv)
+            c = canvas.Canvas(output_pdf_path, pagesize=letter)
+            width, height = letter
+            c.setFont("Helvetica-Bold", 14)
+            c.drawString(50, height - 40, "Image OCR Results")
+            # The input file from Gradio is a temp file that will be cleaned up.
+            # We can't display it directly in the PDF from its path.
+            # To draw it in the PDF, we save it to a new temporary path.
+            temp_img_path = os.path.join(temp_output_dir, "original_image.png")
+            original_image.save(temp_img_path)
+            # Draw the image on the PDF
+            c.drawImage(temp_img_path, 50, height - 300, width=200, preserveAspectRatio=True)
+            y = height - 350
+            c.setFont("Helvetica", 12)
+            for _, text in results:
+                c.drawString(50, y, text)
+                y -= 15
+                if y < 50:
+                    c.showPage()
+                    c.setFont("Helvetica", 12)
+                    y = height - 50
+            c.save()
+            os.remove(temp_img_path)
+        return output_pdf_path
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        # Clean up temporary directory on error
+        # shutil.rmtree(temp_output_dir)
+        return None
+# Gradio Interface
+# The `@GPU` decorator is used here to ensure this function runs on a GPU.
+@GPU
+def process_file_for_gradio(file):
+    # This wrapper function is needed because Gradio's `File` component passes a temp file.
+    # We call our main processing function and return the path to the output PDF.
+    output_path = process_file_and_create_pdf(file)
+    if output_path is None:
+        return None
+    return output_path
+demo = gr.Interface(
+    fn=process_file_for_gradio,
+    inputs=gr.File(label="Upload an Image (PNG, JPG) or a PDF", file_types=['.png', '.jpg', '.jpeg', '.pdf']),
+    outputs=gr.File(label="Download OCR Results PDF"),
+    title="OCR App with PaddleOCR and TrOCR",
+    description="Upload an image or a multi-page PDF to get an output PDF with the recognized text from each page.",
+    examples=[
+        # Here you can provide paths to example files in your repo
+        # "example.png",
+        # "example.pdf"
+    ]
+)
+if __name__ == "__main__":
+    # You will need to set the hardware configuration in the `README.md` file
+    # of your Hugging Face Space for the GPU to be available.
+    demo.launch()