# Import the GPU decorator for ZeroGPU Spaces
from spaces import GPU
import os
import cv2
import numpy as np
import torch
import tempfile
import shutil
import gradio as gr
from PIL import Image
from pdf2image import convert_from_path
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from paddleocr import PaddleOCR
import logging

# Disable PaddleOCR logging for a cleaner output
logging.disable(logging.WARNING)

# Select the GPU device if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# --- MODEL LOADING ---
# Load models globally so they are only initialized once when the app starts.

# Initialize the PaddleOCR detection model
print("Initializing PaddleOCR text detection model...")
try:
    det_model = PaddleOCR(use_angle_cls=False, lang='en', use_gpu=torch.cuda.is_available(), show_log=False)
except Exception as e:
    print(f"Error initializing PaddleOCR: {e}")
    det_model = None

# Initialize the TrOCR recognition model and processor
print("Initializing TrOCR text recognition model...")
try:
    trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-handwritten")
    trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-handwritten")
    trocr_model.eval()
    trocr_model.to(device)
except Exception as e:
    print(f"Error initializing TrOCR: {e}")
    trocr_model = None
    trocr_processor = None


# Helper function to save a temp image
def save_temp_image(img):
    """Save an image array to a temporary file and return the path."""
    temp_fd, temp_path = tempfile.mkstemp(suffix='.png')
    cv2.imwrite(temp_path, img)
    os.close(temp_fd)
    return temp_path


def process_image_page(img):
    """
    Process a single image to detect text polygons, crop the regions, and recognize text.
    Returns a list of [box, text] for each cropped region and the original PIL image.
    """
    if det_model is None or trocr_model is None:
        raise RuntimeError("OCR models are not loaded. Please check logs for errors.")

    # Convert OpenCV image (BGR numpy array) to PIL Image (RGB)
    original_pil_image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # PaddleOCR is given a file path here, so save the image to a temp file first
    temp_image_path = save_temp_image(img)

    # Detect text polygons using PaddleOCR
    ocr_result = det_model.ocr(temp_image_path)
    os.remove(temp_image_path)

    arr = []
    # The OCR result is a list of lists, where each inner list represents a text line.
    if ocr_result and ocr_result[0]:
        for line in ocr_result[0]:
            arr.append(line[0])
    print(f"Detected {len(arr)} lines in this page.")

    cropped_images = []
    for box in arr:
        box = np.array(box, dtype=np.float32)
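        # The four corner points are assumed to follow PaddleOCR's usual
        # top-left, top-right, bottom-right, bottom-left order; the destination
        # rectangle below relies on that ordering.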
        # Compute width and height of the straightened image
        width_a = np.linalg.norm(box[0] - box[1])
        width_b = np.linalg.norm(box[2] - box[3])
        height_a = np.linalg.norm(box[0] - box[3])
        height_b = np.linalg.norm(box[1] - box[2])
        width = int(max(width_a, width_b))
        height = int(max(height_a, height_b))
        dst_rect = np.array([
            [0, 0],
            [width - 1, 0],
            [width - 1, height - 1],
            [0, height - 1]
        ], dtype=np.float32)
        # Perspective transform to straighten the detected region
        M = cv2.getPerspectiveTransform(box, dst_rect)
        warped = cv2.warpPerspective(img, M, (width, height))
        cropped_images.append(warped)

    # Reverse cropped images and corresponding boxes to match reading order
    cropped_images.reverse()
    arr.reverse()

    # Text recognition with TrOCR
    results = []
    for i, crop in enumerate(cropped_images):
        image_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
        pixel_values = trocr_processor(images=image_pil, return_tensors="pt").pixel_values.to(device)
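        # Run generation without gradient tracking; max_new_tokens caps the
        # decoded length for each detected line.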
        with torch.no_grad():
            generated_ids = trocr_model.generate(pixel_values, max_new_tokens=64)
        generated_text = trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        results.append([arr[i], generated_text])
        print(f"Recognized: {generated_text}")

    return results, original_pil_image
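
# Note: recognition in process_image_page runs one crop at a time; batching the
# crops through the TrOCR processor and a single generate call would likely be
# faster, but the per-crop loop is kept for simplicity.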


@GPU
def process_file_and_create_pdf(file):
    """
    Main function to process a file (image or PDF) and return a path to a new PDF.
    The temporary output directory is cleaned up only if processing fails, so the
    generated PDF remains available for Gradio to serve.
    The @GPU decorator runs this function on the GPU on ZeroGPU Spaces.
    """
    if file is None:
        return None, None

    temp_output_dir = tempfile.mkdtemp()
    output_pdf_path = os.path.join(temp_output_dir, "ocr_results.pdf")
    input_image_for_display = None
    try:
        if file.name.lower().endswith('.pdf'):
            # Convert PDF to images
            print(f"Converting PDF {file.name} to images...")
            images = convert_from_path(file.name, dpi=300)
            if images:
                # Set the first page as the image to display
                input_image_for_display = images[0]
            c = canvas.Canvas(output_pdf_path, pagesize=letter)
            width, height = letter
            for page_num, page in enumerate(images):
                print(f"\nProcessing page {page_num + 1}")
                img_cv = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
                # Check if the background is dark and text is light (simple heuristic)
                gray_image = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
                avg_intensity = np.mean(gray_image)
                if avg_intensity < 100:
                    print("Inverting colors for dark background.")
                    img_cv = cv2.bitwise_not(img_cv)
                results, _ = process_image_page(img_cv)
                c.setFont("Helvetica-Bold", 14)
                c.drawString(50, height - 40, f"Page {page_num + 1} - OCR Results")
                y = height - 60
                c.setFont("Helvetica", 12)
                for _, text in results:
                    c.drawString(50, y, text)
                    y -= 15
                    if y < 50:
                        c.showPage()
                        c.setFont("Helvetica-Bold", 14)
                        c.drawString(50, height - 40, f"Page {page_num + 1} (cont.) - OCR Results")
                        # Restore the body font after the continuation header
                        c.setFont("Helvetica", 12)
                        y = height - 60
                c.showPage()
            c.save()
        else:  # Handle single image file
            img_cv = cv2.imread(file.name)
            if img_cv is None:
                raise ValueError("Failed to load image.")
            input_image_for_display = Image.fromarray(cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB))
            # Check if the background is dark and text is light (simple heuristic)
            gray_image = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
            avg_intensity = np.mean(gray_image)
            if avg_intensity < 100:
                print("Inverting colors for dark background.")
                img_cv = cv2.bitwise_not(img_cv)
            results, original_image = process_image_page(img_cv)
            c = canvas.Canvas(output_pdf_path, pagesize=letter)
            width, height = letter
            c.setFont("Helvetica-Bold", 14)
            c.drawString(50, height - 40, "Image OCR Results")
            temp_img_path = os.path.join(temp_output_dir, "original_image.png")
            original_image.save(temp_img_path)
            c.drawImage(temp_img_path, 50, height - 300, width=200, preserveAspectRatio=True)
            os.remove(temp_img_path)
            y = height - 350
            c.setFont("Helvetica", 12)
            for _, text in results:
                c.drawString(50, y, text)
                y -= 15
                if y < 50:
                    c.showPage()
                    c.setFont("Helvetica", 12)
                    y = height - 50
            c.save()
| print(f"Generated PDF path: {output_pdf_path}") | |
| return output_pdf_path, input_image_for_display | |
| except Exception as e: | |
| print(f"An error occurred: {e}") | |
| # Return None, None on error | |
| return None, None | |
| finally: | |
| # Ensure temporary directory is cleaned up after the function returns | |
| if os.path.exists(temp_output_dir): | |
| print(f"Cleaning up temporary directory: {temp_output_dir}") | |
| shutil.rmtree(temp_output_dir) | |


# Gradio Interface
def process_file_for_gradio(image_file, pdf_file):
    """
    Wrapper function for the Gradio interface with separate inputs.
    This function checks which input was provided and calls the main
    processing logic accordingly.
    """
    if image_file is not None:
        # The gr.Image component returns a PIL Image object.
        # We need to save it to a temporary file for the main function.
        temp_dir = tempfile.mkdtemp()
        image_path = os.path.join(temp_dir, "uploaded_image.png")
        image_file.save(image_path)

        # Create a mock file object to be compatible with the main function
        class MockFile:
            def __init__(self, name):
                self.name = name

        mock_file = MockFile(image_path)
        output_path, input_image = process_file_and_create_pdf(mock_file)
        shutil.rmtree(temp_dir)
        return output_path, input_image
    elif pdf_file is not None:
        # The gr.File component passes a temporary file object directly
        output_path, input_image = process_file_and_create_pdf(pdf_file)
        return output_path, input_image
    else:
        return None, None


demo = gr.Interface(
    fn=process_file_for_gradio,
    inputs=[
        gr.Image(label="Upload an Image", type="pil"),
        gr.File(label="Upload a PDF", file_types=['.pdf'])
    ],
    outputs=[
        gr.File(label="Download OCR Results PDF", interactive=False, visible=True),
        gr.Image(label="Uploaded File Preview", interactive=False)
    ],
    title="OCR App with PaddleOCR and TrOCR",
    description="Upload an image or a multi-page PDF to get an output PDF with the recognized text from each page. The generated PDF is offered for download once processing finishes.",
)

if __name__ == "__main__":
    demo.launch()
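
# To run locally (a sketch; assumes this file is saved as app.py and that the
# imported dependencies, plus poppler for pdf2image, are installed):
#   python app.py
# Gradio then prints a local URL to open in the browser.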