# Source: Hugging Face Space "imperiusrex/Handwritten_OCR" (status: Sleeping)
# File size: 10,447 Bytes

# Import the GPU decorator for ZeroGPU Spaces
from spaces import GPU

import os
import cv2
import numpy as np
import torch
import tempfile
import shutil
import gradio as gr
from PIL import Image
from pdf2image import convert_from_path
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from paddleocr import PaddleOCR
import logging

# Silence log output for a cleaner console.
# NOTE: logging.disable() is process-wide: it suppresses records of severity
# WARNING and below from every logger, not only PaddleOCR's.
logging.disable(logging.WARNING)

# Torch device used for the TrOCR model and its input tensors:
# CUDA when torch reports it as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")

# --- MODEL LOADING ---
# Both models are created once, at import time, and shared by every request.
# If a model cannot be created the error is printed and its global is left as
# None so the failure can be detected later.


def _load_detector():
    """Build the PaddleOCR instance used for text detection."""
    return PaddleOCR(use_angle_cls=False, lang='en', use_gpu=torch.cuda.is_available(), show_log=False)


def _load_recognizer():
    """Load the TrOCR processor and model; the model is put in eval mode and moved to ``device``."""
    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")
    model.eval()
    model.to(device)
    return processor, model


# PaddleOCR text detection model
print("Initializing PaddleOCR text detection model...")
try:
    det_model = _load_detector()
except Exception as e:
    print(f"Error initializing PaddleOCR: {e}")
    det_model = None

# TrOCR text recognition model and its processor
print("Initializing TrOCR text recognition model...")
try:
    trocr_processor, trocr_model = _load_recognizer()
except Exception as e:
    print(f"Error initializing TrOCR: {e}")
    trocr_model = None
    trocr_processor = None

# Helper function to save a temp image
def save_temp_image(img):
    """Write an image array to a new temporary PNG file and return its path.

    Args:
        img: Image array accepted by ``cv2.imwrite``.

    Returns:
        Path of the written PNG file. The caller is responsible for
        deleting it.

    Raises:
        IOError: If OpenCV reports that the image could not be written.
    """
    temp_fd, temp_path = tempfile.mkstemp(suffix='.png')
    # Only the reserved path is needed; close the descriptor right away so it
    # cannot leak if the write below fails.
    os.close(temp_fd)
    try:
        # cv2.imwrite signals some failures by returning False, not raising.
        if not cv2.imwrite(temp_path, img):
            raise IOError(f"Could not write temporary image to {temp_path}")
    except Exception:
        # Do not leave an empty or partial file behind on failure.
        os.remove(temp_path)
        raise
    return temp_path

def _straighten_region(img, polygon):
    """Warp one detected quadrilateral to an upright rectangle; return None if it is degenerate."""
    quad = np.array(polygon, dtype=np.float32)

    # Edges 0-1 and 2-3 are treated as the horizontal sides of the region and
    # edges 0-3 and 1-2 as the vertical ones; the longer of each pair sets the
    # size of the straightened crop.
    width = int(max(np.linalg.norm(quad[0] - quad[1]), np.linalg.norm(quad[2] - quad[3])))
    height = int(max(np.linalg.norm(quad[0] - quad[3]), np.linalg.norm(quad[1] - quad[2])))

    # A region thinner than two pixels collapses the destination rectangle
    # (and a zero-sized one makes warpPerspective fail), so skip it.
    if width < 2 or height < 2:
        return None

    dst_rect = np.array(
        [
            [0, 0],
            [width - 1, 0],
            [width - 1, height - 1],
            [0, height - 1],
        ],
        dtype=np.float32,
    )
    transform = cv2.getPerspectiveTransform(quad, dst_rect)
    return cv2.warpPerspective(img, transform, (width, height))


def process_image_page(img):
    """
    Detect text regions on one page image, crop them, and recognize their text.

    Args:
        img: Page image as an OpenCV BGR numpy array.

    Returns:
        A tuple ``(results, original_pil_image)`` where ``results`` is a list
        of ``[box, text]`` entries (one per recognized region) and
        ``original_pil_image`` is ``img`` converted to an RGB PIL image.

    Raises:
        RuntimeError: If the detection or recognition model failed to load.
    """
    if det_model is None or trocr_model is None:
        raise RuntimeError("OCR models are not loaded. Please check logs for errors.")

    # Convert OpenCV image (BGR numpy array) to PIL Image (RGB)
    original_pil_image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # The detector is given a file path, so the page is written to a temp file.
    # The file is removed even when detection raises.
    temp_image_path = save_temp_image(img)
    try:
        ocr_result = det_model.ocr(temp_image_path)
    finally:
        os.remove(temp_image_path)

    # The first element of the result holds one entry per detected text line;
    # each entry starts with the line's polygon.
    boxes = [line[0] for line in ocr_result[0]] if ocr_result and ocr_result[0] else []
    print(f"Detected {len(boxes)} lines in this page.")

    # Keep each box paired with its crop so skipping a degenerate region can
    # never shift the box/text association.
    regions = []
    for box in boxes:
        crop = _straighten_region(img, box)
        if crop is not None:
            regions.append((box, crop))

    # Reverse the detection order to match reading order.
    # NOTE(review): this assumes the detector lists lines last-to-first;
    # confirm against the installed PaddleOCR version.
    regions.reverse()

    # Text recognition with TrOCR
    results = []
    with torch.no_grad():
        for box, crop in regions:
            image_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
            pixel_values = trocr_processor(images=image_pil, return_tensors="pt").pixel_values.to(device)

            generated_ids = trocr_model.generate(pixel_values, max_new_tokens=64)
            generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            results.append([box, generated_text])
            print(f"Recognized: {generated_text}")

    return results, original_pil_image

def _invert_if_dark(img_cv):
    """Return the BGR image inverted when its mean gray level is below 100 (dark background heuristic)."""
    gray_image = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
    if np.mean(gray_image) < 100:
        print("Inverting colors for dark background.")
        return cv2.bitwise_not(img_cv)
    return img_cv


def _draw_text_lines(c, lines, start_y, page_height, continuation_heading=None):
    """Draw text lines top-down in the body font, opening a new PDF page when the bottom margin is reached."""
    c.setFont("Helvetica", 12)
    y = start_y
    for text in lines:
        # Break *before* drawing so a page is only started when there is
        # still text to put on it.
        if y < 50:
            c.showPage()
            if continuation_heading is not None:
                c.setFont("Helvetica-Bold", 14)
                c.drawString(50, page_height - 40, continuation_heading)
                y = page_height - 60
            else:
                y = page_height - 50
            # Switch back to the body font after the page break / heading.
            c.setFont("Helvetica", 12)
        c.drawString(50, y, text)
        y -= 15


def process_file_and_create_pdf(file):
    """
    Run OCR on an image or PDF file and write the recognized text to a new PDF.

    Args:
        file: Either a filesystem path or an object whose ``name`` attribute
            is the path of the uploaded file. A path ending in ``.pdf``
            (case-insensitive) is processed page by page; anything else is
            loaded as a single image.

    Returns:
        ``(output_pdf_path, preview_image)`` on success, where
        ``preview_image`` is a PIL image of the input (the first page for a
        PDF). ``(None, None)`` when ``file`` is None or processing fails.

    The PDF is written into a fresh temporary directory. That directory is
    kept on success, because the returned path must still exist when the
    caller reads it, and is removed only when processing fails.
    """
    if file is None:
        return None, None

    temp_output_dir = tempfile.mkdtemp()
    output_pdf_path = os.path.join(temp_output_dir, "ocr_results.pdf")
    input_image_for_display = None

    try:
        # Depending on the Gradio version/configuration the upload arrives as
        # a path string or as an object exposing the path via ``.name``.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name

        _, page_height = letter

        if file_path.lower().endswith('.pdf'):
            # Convert PDF to images
            print(f"Converting PDF {file_path} to images...")
            images = convert_from_path(file_path, dpi=300)

            if images:
                # The first page doubles as the preview image.
                input_image_for_display = images[0]

            c = canvas.Canvas(output_pdf_path, pagesize=letter)

            for page_num, page in enumerate(images, start=1):
                print(f"\nProcessing page {page_num}")
                img_cv = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
                results, _ = process_image_page(_invert_if_dark(img_cv))

                c.setFont("Helvetica-Bold", 14)
                c.drawString(50, page_height - 40, f"Page {page_num} - OCR Results")

                _draw_text_lines(
                    c,
                    [text for _, text in results],
                    start_y=page_height - 60,
                    page_height=page_height,
                    continuation_heading=f"Page {page_num} (cont.) - OCR Results",
                )
                c.showPage()
            c.save()

        else:  # Handle single image file
            img_cv = cv2.imread(file_path)
            if img_cv is None:
                raise ValueError("Failed to load image.")

            # The preview shows the upload as-is, before any inversion.
            input_image_for_display = Image.fromarray(cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB))

            results, original_image = process_image_page(_invert_if_dark(img_cv))

            c = canvas.Canvas(output_pdf_path, pagesize=letter)
            c.setFont("Helvetica-Bold", 14)
            c.drawString(50, page_height - 40, "Image OCR Results")

            temp_img_path = os.path.join(temp_output_dir, "original_image.png")
            original_image.save(temp_img_path)
            # Fit the thumbnail into a 200x240 pt box between the heading and
            # the text. Without an explicit height the box takes the image's
            # own height, which can push a scaled-down tall image off the page.
            c.drawImage(
                temp_img_path,
                50,
                page_height - 300,
                width=200,
                height=240,
                preserveAspectRatio=True,
                anchor='nw',
            )

            _draw_text_lines(
                c,
                [text for _, text in results],
                start_y=page_height - 350,
                page_height=page_height,
            )
            c.save()
            # Only the PDF should remain in the returned directory.
            os.remove(temp_img_path)

        print(f"Generated PDF path: {output_pdf_path}")
        return output_pdf_path, input_image_for_display

    except Exception as e:
        print(f"An error occurred: {e}")
        # Nothing usable was produced, so the output directory can go.
        print(f"Cleaning up temporary directory: {temp_output_dir}")
        shutil.rmtree(temp_output_dir, ignore_errors=True)
        return None, None

# Gradio Interface
@GPU
def process_file_for_gradio(image_file, pdf_file):
    """
    Gradio entry point with separate image and PDF inputs.

    Args:
        image_file: PIL image from the image input, or None.
        pdf_file: Uploaded PDF from the file input, or None.

    Returns:
        The ``(output_pdf_path, preview_image)`` pair produced by
        ``process_file_and_create_pdf``, or ``(None, None)`` when neither
        input was provided. The image input takes precedence when both are set.
    """
    if image_file is not None:
        from types import SimpleNamespace

        # The main function works on a file path, so the PIL image is first
        # written to a scratch directory. The directory is removed even if
        # saving or processing raises.
        temp_dir = tempfile.mkdtemp()
        try:
            image_path = os.path.join(temp_dir, "uploaded_image.png")
            image_file.save(image_path)

            # Stand-in for an uploaded-file object: only ``.name`` is read.
            return process_file_and_create_pdf(SimpleNamespace(name=image_path))
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    if pdf_file is not None:
        # The uploaded PDF is handed to the main function unchanged.
        return process_file_and_create_pdf(pdf_file)

    return None, None


# Input widgets: the image is delivered as a PIL object, the PDF as an upload.
_image_input = gr.Image(label="Upload an Image", type="pil")
_pdf_input = gr.File(label="Upload a PDF", file_types=['.pdf'])

# Output widgets: the generated PDF and a preview of what was uploaded.
_pdf_output = gr.File(label="Download OCR Results PDF", interactive=False, visible=True)
_preview_output = gr.Image(label="Uploaded File Preview", interactive=False)

# Inputs and outputs are matched positionally with the arguments and return
# values of process_file_for_gradio.
demo = gr.Interface(
    fn=process_file_for_gradio,
    inputs=[_image_input, _pdf_input],
    outputs=[_pdf_output, _preview_output],
    title="OCR App with PaddleOCR and TrOCR",
    description="Upload an image or a multi-page PDF to get an output PDF with the recognized text from each page. The output PDF will be downloaded automatically.",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()