Spaces:
Sleeping
Sleeping
| # Import the GPU decorator for ZeroGPU Spaces | |
| # This will be a no-op if the space is not configured for ZeroGPU | |
| # but it is required for the specified hardware to work correctly. | |
| from spaces import GPU | |
| import os | |
| import cv2 | |
| import numpy as np | |
| import torch | |
| import tempfile | |
| import gradio as gr | |
| from PIL import Image | |
| from pdf2image import convert_from_path | |
| from reportlab.lib.pagesizes import letter | |
| from reportlab.pdfgen import canvas | |
| from transformers import TrOCRProcessor, VisionEncoderDecoderModel | |
| from paddleocr import PaddleOCR, TextDetection | |
# Choose the torch device once, at import time. The `spaces.GPU` decorator
# deals with dynamic GPU allocation, but PyTorch and the other GPU-enabled
# libraries still need to be told which device to use.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
# --- MODEL LOADING ---
# Both models are created at import time so that every request reuses them.
# A failed initialization is logged and leaves the affected global(s) set to
# None instead of aborting the import.

# PaddleOCR provides the text-line polygons. Angle classification is switched
# off for efficiency: each detected region is straightened with a perspective
# warp before recognition.
print("Initializing PaddleOCR text detection model...")
try:
    det_model = PaddleOCR(
        use_angle_cls=False,
        lang='en',
        use_gpu=torch.cuda.is_available(),
        show_log=False,
    )
except Exception as exc:
    print(f"Error initializing PaddleOCR: {exc}")
    det_model = None

# TrOCR (processor + encoder-decoder model) reads the text of each region.
print("Initializing TrOCR text recognition model...")
try:
    trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
    trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")
    # Inference only: switch to eval mode, then move the weights to `device`.
    trocr_model.eval().to(device)
except Exception as exc:
    print(f"Error initializing TrOCR: {exc}")
    trocr_model = trocr_processor = None
# Helper function to save a temp image
def save_temp_image(img):
    """Write an image array to a new temporary PNG file and return its path.

    The caller owns the returned file and is responsible for deleting it.

    Args:
        img: Image array in a form accepted by ``cv2.imwrite``.

    Returns:
        Path of the PNG file that was written.

    Raises:
        IOError: If OpenCV reports that the image could not be written.
    """
    temp_fd, temp_path = tempfile.mkstemp(suffix='.png')
    # Only the path is needed. Closing the descriptor immediately means it
    # cannot leak if the write fails, and the file is not held open by this
    # process while OpenCV writes to it.
    os.close(temp_fd)
    try:
        written = cv2.imwrite(temp_path, img)
    except Exception:
        # Do not leave an empty temp file behind when encoding blows up.
        os.remove(temp_path)
        raise
    if not written:
        os.remove(temp_path)
        raise IOError(f"Could not write temporary image to {temp_path}")
    return temp_path
def _straighten_region(img, box):
    """Warp the quadrilateral ``box`` out of ``img`` into an upright rectangle.

    The four corners of ``box`` are mapped, in order, to the top-left,
    top-right, bottom-right and bottom-left corners of the output.

    Returns:
        The warped crop, or None when the quadrilateral is less than one
        pixel wide or tall and therefore cannot be warped.
    """
    quad = np.array(box, dtype=np.float32)
    # Output size: the longer of each pair of opposite edges.
    width = int(max(np.linalg.norm(quad[0] - quad[1]),
                    np.linalg.norm(quad[2] - quad[3])))
    height = int(max(np.linalg.norm(quad[0] - quad[3]),
                     np.linalg.norm(quad[1] - quad[2])))
    if width < 1 or height < 1:
        return None
    dst_rect = np.array([
        [0, 0],
        [width - 1, 0],
        [width - 1, height - 1],
        [0, height - 1],
    ], dtype=np.float32)
    transform = cv2.getPerspectiveTransform(quad, dst_rect)
    return cv2.warpPerspective(img, transform, (width, height))


def process_image_page(img):
    """Detect text regions on one page image and recognize the text in each.

    Args:
        img: Page image as an OpenCV BGR numpy array.

    Returns:
        A tuple ``(results, original_pil_image)``. ``results`` is a list of
        ``[box, text]`` pairs, one per recognized region, in reverse
        detection order; ``original_pil_image`` is the page as an RGB PIL
        image. ``results`` is empty when no text is detected.

    Raises:
        RuntimeError: If the detection or recognition model failed to load.
    """
    if det_model is None or trocr_model is None:
        raise RuntimeError("OCR models are not loaded. Please check logs for errors.")

    # Convert OpenCV image (BGR numpy array) to PIL Image (RGB)
    original_pil_image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # The detector is handed a file path, so the page is written to a
    # temporary PNG first. The file is removed even if detection raises.
    temp_image_path = save_temp_image(img)
    try:
        # `ocr` returns detection and recognition results; only the
        # detection polygons are used here.
        ocr_result = det_model.ocr(temp_image_path)
    finally:
        os.remove(temp_image_path)

    # PaddleOCR can report a page without any text as `[None]` (or an empty
    # list); treat both as "no lines" instead of iterating over None.
    page_lines = ocr_result[0] if ocr_result else None
    boxes = [line[0] for line in page_lines] if page_lines else []
    print(f"Detected {len(boxes)} lines in this page.")

    # Keep each box paired with its crop so that skipping a degenerate box
    # can never shift texts onto the wrong box.
    regions = []
    for box in boxes:
        warped = _straighten_region(img, box)
        if warped is not None:
            regions.append((box, warped))

    # NOTE(review): regions are recognized in reverse detection order, as in
    # the original pipeline - confirm this gives the desired reading order.
    regions.reverse()

    # Text recognition with TrOCR
    results = []
    for box, crop in regions:
        image_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        pixel_values = trocr_processor(images=image_pil, return_tensors="pt").pixel_values.to(device)
        with torch.no_grad():
            generated_ids = trocr_model.generate(pixel_values, max_new_tokens=64)
        generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        results.append([box, generated_text])
        print(f"Recognized: {generated_text}")
    return results, original_pil_image
def _draw_page_title(c, page_height, title):
    """Draw a bold title at the top of the current page, then restore the body font."""
    c.setFont("Helvetica-Bold", 14)
    c.drawString(50, page_height - 40, title)
    c.setFont("Helvetica", 12)


def _draw_text_lines(c, lines, y, page_height, continuation_title=None):
    """Draw ``lines`` top-down starting at ``y``, opening new pages as needed.

    A new page is started only when another line still has to be drawn, so
    no page is emitted that holds nothing but a continuation title. When
    ``continuation_title`` is given it is drawn at the top of each new page.
    """
    for text in lines:
        if y < 50:
            c.showPage()
            if continuation_title is None:
                c.setFont("Helvetica", 12)
                y = page_height - 50
            else:
                _draw_page_title(c, page_height, continuation_title)
                y = page_height - 60
        c.drawString(50, y, text)
        y -= 15


def process_file_and_create_pdf(file):
    """Run OCR on an uploaded image or PDF and write the text to a new PDF.

    Args:
        file: The upload, either a path string or an object whose ``name``
            attribute is the path (as passed by Gradio's ``File``
            component). A path ending in ``.pdf`` is rendered page by page;
            anything else is loaded as a single image.

    Returns:
        Path of the generated ``ocr_results.pdf``, or None when no file was
        given or processing failed (the error is printed).
    """
    if file is None:
        # Always return a path or None: the result goes to a File output.
        return None

    import shutil  # local import: only needed for cleanup on failure

    source_path = os.fspath(getattr(file, "name", file))
    temp_output_dir = tempfile.mkdtemp()
    output_pdf_path = os.path.join(temp_output_dir, "ocr_results.pdf")
    _, page_height = letter
    try:
        if source_path.lower().endswith('.pdf'):
            # Convert PDF to images
            print(f"Converting PDF {source_path} to images...")
            # pdf2image relies on poppler being installed on the system
            # (e.g. via a `packages.txt` file); it is assumed to be available.
            images = convert_from_path(source_path, dpi=300)
            c = canvas.Canvas(output_pdf_path, pagesize=letter)
            for page_num, page in enumerate(images, start=1):
                print(f"\nProcessing page {page_num}")
                img_cv = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
                results, _ = process_image_page(img_cv)
                _draw_page_title(c, page_height, f"Page {page_num} - OCR Results")
                _draw_text_lines(
                    c,
                    [text for _, text in results],
                    page_height - 60,
                    page_height,
                    continuation_title=f"Page {page_num} (cont.) - OCR Results",
                )
                c.showPage()
            c.save()
        else:  # Handle single image file
            img_cv = cv2.imread(source_path)
            if img_cv is None:
                raise ValueError("Failed to load image.")
            results, original_image = process_image_page(img_cv)
            c = canvas.Canvas(output_pdf_path, pagesize=letter)
            _draw_page_title(c, page_height, "Image OCR Results")
            # The uploaded file is a temp file that may be cleaned up, so the
            # image is re-saved next to the output before being embedded.
            temp_img_path = os.path.join(temp_output_dir, "original_image.png")
            original_image.save(temp_img_path)
            # Fit the image into a 200x240 box directly under the title.
            # Without an explicit height the box would be as tall as the
            # image's pixel height, which pushes large images off the page.
            c.drawImage(
                temp_img_path,
                50,
                page_height - 300,
                width=200,
                height=240,
                preserveAspectRatio=True,
                anchor='nw',
            )
            _draw_text_lines(
                c,
                [text for _, text in results],
                page_height - 350,
                page_height,
            )
            c.save()
            os.remove(temp_img_path)
        return output_pdf_path
    except Exception as e:
        print(f"An error occurred: {e}")
        # Nothing usable was produced; do not leave the temp directory behind.
        shutil.rmtree(temp_output_dir, ignore_errors=True)
        return None
# Gradio Interface
# `@GPU` requests a GPU for the duration of each call on ZeroGPU Spaces and
# is a no-op on other hardware.
@GPU
def process_file_for_gradio(file):
    """Gradio callback: OCR the uploaded file and return the result PDF path.

    Args:
        file: Value passed by the ``gr.File`` input, or None when nothing
            was uploaded.

    Returns:
        Path of the generated PDF for the ``gr.File`` output, or None when
        there was no upload or processing failed.
    """
    if file is None:
        # Nothing to process; never forward a non-path value to the output.
        return None
    return process_file_and_create_pdf(file)
# Single-function UI: one file upload in, one downloadable PDF out.
demo = gr.Interface(
    fn=process_file_for_gradio,
    title="OCR App with PaddleOCR and TrOCR",
    description="Upload an image or a multi-page PDF to get an output PDF with the recognized text from each page.",
    inputs=gr.File(
        label="Upload an Image (PNG, JPG) or a PDF",
        file_types=['.png', '.jpg', '.jpeg', '.pdf'],
    ),
    outputs=gr.File(label="Download OCR Results PDF"),
    # Paths to example files in the repo can be listed here,
    # e.g. "example.png" or "example.pdf".
    examples=[],
)

if __name__ == "__main__":
    # GPU availability depends on the hardware configuration set in the
    # Space's `README.md`.
    demo.launch()