Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,3 @@
|
|
| 1 |
-
# Import the GPU decorator for ZeroGPU Spaces
|
| 2 |
-
# This will be a no-op if the space is not configured for ZeroGPU
|
| 3 |
-
# but it is required for the specified hardware to work correctly.
|
| 4 |
from spaces import GPU
|
| 5 |
|
| 6 |
import os
|
|
@@ -8,13 +5,19 @@ import cv2
|
|
| 8 |
import numpy as np
|
| 9 |
import torch
|
| 10 |
import tempfile
|
|
|
|
| 11 |
import gradio as gr
|
| 12 |
from PIL import Image
|
| 13 |
from pdf2image import convert_from_path
|
| 14 |
from reportlab.lib.pagesizes import letter
|
| 15 |
from reportlab.pdfgen import canvas
|
| 16 |
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
|
| 17 |
-
from paddleocr import PaddleOCR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Set the GPU device if available
|
| 20 |
# The `spaces.GPU` decorator handles the dynamic GPU allocation, but we still need to
|
|
@@ -80,8 +83,10 @@ def process_image_page(img):
|
|
| 80 |
arr = []
|
| 81 |
# The OCR result is a list of lists, where each inner list represents a text line.
|
| 82 |
# The first element is the bounding box coordinates.
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
|
| 86 |
print(f"Detected {len(arr)} lines in this page.")
|
| 87 |
|
|
@@ -110,7 +115,7 @@ def process_image_page(img):
|
|
| 110 |
warped = cv2.warpPerspective(img, M, (width, height))
|
| 111 |
cropped_images.append(warped)
|
| 112 |
|
| 113 |
-
# Reverse cropped images and corresponding boxes
|
| 114 |
cropped_images.reverse()
|
| 115 |
arr.reverse()
|
| 116 |
|
|
@@ -135,27 +140,38 @@ def process_file_and_create_pdf(file):
|
|
| 135 |
The @GPU decorator ensures this function is run on the GPU.
|
| 136 |
"""
|
| 137 |
if file is None:
|
| 138 |
-
return None,
|
| 139 |
|
| 140 |
temp_output_dir = tempfile.mkdtemp()
|
| 141 |
output_pdf_path = os.path.join(temp_output_dir, "ocr_results.pdf")
|
|
|
|
|
|
|
| 142 |
|
| 143 |
try:
|
| 144 |
if file.name.lower().endswith('.pdf'):
|
| 145 |
# Convert PDF to images
|
| 146 |
print(f"Converting PDF {file.name} to images...")
|
| 147 |
-
# Use `poppler_path` if poppler is installed on the system, otherwise
|
| 148 |
-
# it might be necessary to install it via a `packages.txt` file.
|
| 149 |
-
# Here we assume it's available.
|
| 150 |
images = convert_from_path(file.name, dpi=300)
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
c = canvas.Canvas(output_pdf_path, pagesize=letter)
|
| 153 |
width, height = letter
|
| 154 |
|
| 155 |
for page_num, page in enumerate(images):
|
| 156 |
print(f"\nProcessing page {page_num + 1}")
|
| 157 |
img_cv = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
c.setFont("Helvetica-Bold", 14)
|
| 161 |
c.drawString(50, height - 40, f"Page {page_num + 1} - OCR Results")
|
|
@@ -178,6 +194,15 @@ def process_file_and_create_pdf(file):
|
|
| 178 |
if img_cv is None:
|
| 179 |
raise ValueError("Failed to load image.")
|
| 180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
results, original_image = process_image_page(img_cv)
|
| 182 |
|
| 183 |
c = canvas.Canvas(output_pdf_path, pagesize=letter)
|
|
@@ -185,14 +210,11 @@ def process_file_and_create_pdf(file):
|
|
| 185 |
c.setFont("Helvetica-Bold", 14)
|
| 186 |
c.drawString(50, height - 40, "Image OCR Results")
|
| 187 |
|
| 188 |
-
#
|
| 189 |
-
# We can't display it directly in the PDF from its path.
|
| 190 |
-
# To draw it in the PDF, we save it to a new temporary path.
|
| 191 |
temp_img_path = os.path.join(temp_output_dir, "original_image.png")
|
| 192 |
original_image.save(temp_img_path)
|
| 193 |
-
|
| 194 |
-
# Draw the image on the PDF
|
| 195 |
c.drawImage(temp_img_path, 50, height - 300, width=200, preserveAspectRatio=True)
|
|
|
|
| 196 |
|
| 197 |
y = height - 350
|
| 198 |
c.setFont("Helvetica", 12)
|
|
@@ -204,15 +226,15 @@ def process_file_and_create_pdf(file):
|
|
| 204 |
c.setFont("Helvetica", 12)
|
| 205 |
y = height - 50
|
| 206 |
c.save()
|
| 207 |
-
os.remove(temp_img_path)
|
| 208 |
|
| 209 |
-
return output_pdf_path
|
| 210 |
|
| 211 |
except Exception as e:
|
| 212 |
print(f"An error occurred: {e}")
|
| 213 |
# Clean up temporary directory on error
|
| 214 |
-
|
| 215 |
-
|
|
|
|
| 216 |
|
| 217 |
# Gradio Interface
|
| 218 |
# The `@GPU` decorator is used here to ensure this function runs on a GPU.
|
|
@@ -220,17 +242,20 @@ def process_file_and_create_pdf(file):
|
|
| 220 |
def process_file_for_gradio(file):
|
| 221 |
# This wrapper function is needed because Gradio's `File` component passes a temp file.
|
| 222 |
# We call our main processing function and return the path to the output PDF.
|
| 223 |
-
output_path = process_file_and_create_pdf(file)
|
| 224 |
if output_path is None:
|
| 225 |
-
return None
|
| 226 |
-
return output_path
|
| 227 |
|
| 228 |
demo = gr.Interface(
|
| 229 |
fn=process_file_for_gradio,
|
| 230 |
inputs=gr.File(label="Upload an Image (PNG, JPG) or a PDF", file_types=['.png', '.jpg', '.jpeg', '.pdf']),
|
| 231 |
-
outputs=
|
|
|
|
|
|
|
|
|
|
| 232 |
title="OCR App with PaddleOCR and TrOCR",
|
| 233 |
-
description="Upload an image or a multi-page PDF to get an output PDF with the recognized text from each page.",
|
| 234 |
examples=[
|
| 235 |
# Here you can provide paths to example files in your repo
|
| 236 |
# "example.png",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from spaces import GPU
|
| 2 |
|
| 3 |
import os
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
import torch
|
| 7 |
import tempfile
|
| 8 |
+
import shutil
|
| 9 |
import gradio as gr
|
| 10 |
from PIL import Image
|
| 11 |
from pdf2image import convert_from_path
|
| 12 |
from reportlab.lib.pagesizes import letter
|
| 13 |
from reportlab.pdfgen import canvas
|
| 14 |
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
|
| 15 |
+
from paddleocr import PaddleOCR
|
| 16 |
+
from paddleocr.ppocr.utils.logging import disable_logger
|
| 17 |
+
from IPython.display import display
|
| 18 |
+
|
| 19 |
+
# Disable PaddleOCR logging for a cleaner output
|
| 20 |
+
disable_logger()
|
| 21 |
|
| 22 |
# Set the GPU device if available
|
| 23 |
# The `spaces.GPU` decorator handles the dynamic GPU allocation, but we still need to
|
|
|
|
| 83 |
arr = []
|
| 84 |
# The OCR result is a list of lists, where each inner list represents a text line.
|
| 85 |
# The first element is the bounding box coordinates.
|
| 86 |
+
# Check if ocr_result is not None and has at least one element
|
| 87 |
+
if ocr_result and ocr_result[0]:
|
| 88 |
+
for line in ocr_result[0]:
|
| 89 |
+
arr.append(line[0])
|
| 90 |
|
| 91 |
print(f"Detected {len(arr)} lines in this page.")
|
| 92 |
|
|
|
|
| 115 |
warped = cv2.warpPerspective(img, M, (width, height))
|
| 116 |
cropped_images.append(warped)
|
| 117 |
|
| 118 |
+
# Reverse cropped images and corresponding boxes to match reading order
|
| 119 |
cropped_images.reverse()
|
| 120 |
arr.reverse()
|
| 121 |
|
|
|
|
| 140 |
The @GPU decorator ensures this function is run on the GPU.
|
| 141 |
"""
|
| 142 |
if file is None:
|
| 143 |
+
return None, None
|
| 144 |
|
| 145 |
temp_output_dir = tempfile.mkdtemp()
|
| 146 |
output_pdf_path = os.path.join(temp_output_dir, "ocr_results.pdf")
|
| 147 |
+
|
| 148 |
+
input_image_for_display = None
|
| 149 |
|
| 150 |
try:
|
| 151 |
if file.name.lower().endswith('.pdf'):
|
| 152 |
# Convert PDF to images
|
| 153 |
print(f"Converting PDF {file.name} to images...")
|
|
|
|
|
|
|
|
|
|
| 154 |
images = convert_from_path(file.name, dpi=300)
|
| 155 |
+
|
| 156 |
+
if images:
|
| 157 |
+
# Set the first page as the image to display
|
| 158 |
+
input_image_for_display = images[0]
|
| 159 |
+
|
| 160 |
c = canvas.Canvas(output_pdf_path, pagesize=letter)
|
| 161 |
width, height = letter
|
| 162 |
|
| 163 |
for page_num, page in enumerate(images):
|
| 164 |
print(f"\nProcessing page {page_num + 1}")
|
| 165 |
img_cv = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
|
| 166 |
+
|
| 167 |
+
# Simple heuristic: a low mean gray level suggests light text on a dark background, so invert the page colors
|
| 168 |
+
gray_image = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
|
| 169 |
+
avg_intensity = np.mean(gray_image)
|
| 170 |
+
if avg_intensity < 100: # Example threshold
|
| 171 |
+
print("Inverting colors for dark background.")
|
| 172 |
+
img_cv = cv2.bitwise_not(img_cv)
|
| 173 |
+
|
| 174 |
+
results, _ = process_image_page(img_cv)
|
| 175 |
|
| 176 |
c.setFont("Helvetica-Bold", 14)
|
| 177 |
c.drawString(50, height - 40, f"Page {page_num + 1} - OCR Results")
|
|
|
|
| 194 |
if img_cv is None:
|
| 195 |
raise ValueError("Failed to load image.")
|
| 196 |
|
| 197 |
+
input_image_for_display = Image.fromarray(cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB))
|
| 198 |
+
|
| 199 |
+
# Simple heuristic: a low mean gray level suggests light text on a dark background, so invert the image colors
|
| 200 |
+
gray_image = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
|
| 201 |
+
avg_intensity = np.mean(gray_image)
|
| 202 |
+
if avg_intensity < 100: # Example threshold
|
| 203 |
+
print("Inverting colors for dark background.")
|
| 204 |
+
img_cv = cv2.bitwise_not(img_cv)
|
| 205 |
+
|
| 206 |
results, original_image = process_image_page(img_cv)
|
| 207 |
|
| 208 |
c = canvas.Canvas(output_pdf_path, pagesize=letter)
|
|
|
|
| 210 |
c.setFont("Helvetica-Bold", 14)
|
| 211 |
c.drawString(50, height - 40, "Image OCR Results")
|
| 212 |
|
| 213 |
+
# Draw the original image on the PDF for context
|
|
|
|
|
|
|
| 214 |
temp_img_path = os.path.join(temp_output_dir, "original_image.png")
|
| 215 |
original_image.save(temp_img_path)
|
|
|
|
|
|
|
| 216 |
c.drawImage(temp_img_path, 50, height - 300, width=200, preserveAspectRatio=True)
|
| 217 |
+
os.remove(temp_img_path)
|
| 218 |
|
| 219 |
y = height - 350
|
| 220 |
c.setFont("Helvetica", 12)
|
|
|
|
| 226 |
c.setFont("Helvetica", 12)
|
| 227 |
y = height - 50
|
| 228 |
c.save()
|
|
|
|
| 229 |
|
| 230 |
+
return output_pdf_path, input_image_for_display
|
| 231 |
|
| 232 |
except Exception as e:
|
| 233 |
print(f"An error occurred: {e}")
|
| 234 |
# Clean up temporary directory on error
|
| 235 |
+
if os.path.exists(temp_output_dir):
|
| 236 |
+
shutil.rmtree(temp_output_dir)
|
| 237 |
+
return None, None
|
| 238 |
|
| 239 |
# Gradio Interface
|
| 240 |
# The `@GPU` decorator is used here to ensure this function runs on a GPU.
|
|
|
|
| 242 |
def process_file_for_gradio(file):
|
| 243 |
# This wrapper function is needed because Gradio's `File` component passes a temp file.
|
| 244 |
# We call our main processing function and return the output PDF path together with the preview image.
|
| 245 |
+
output_path, input_image = process_file_and_create_pdf(file)
|
| 246 |
if output_path is None:
|
| 247 |
+
return None, None
|
| 248 |
+
return output_path, input_image
|
| 249 |
|
| 250 |
demo = gr.Interface(
|
| 251 |
fn=process_file_for_gradio,
|
| 252 |
inputs=gr.File(label="Upload an Image (PNG, JPG) or a PDF", file_types=['.png', '.jpg', '.jpeg', '.pdf']),
|
| 253 |
+
outputs=[
|
| 254 |
+
gr.File(label="Download OCR Results PDF", interactive=False, visible=True),
|
| 255 |
+
gr.Image(label="Uploaded Image Preview", interactive=False)
|
| 256 |
+
],
|
| 257 |
title="OCR App with PaddleOCR and TrOCR",
|
| 258 |
+
description="Upload an image or a multi-page PDF to get an output PDF with the recognized text from each page. The output PDF will be downloaded automatically.",
|
| 259 |
examples=[
|
| 260 |
# Here you can provide paths to example files in your repo
|
| 261 |
# "example.png",
|