imperiusrex committed on
Commit
7f16886
·
verified ·
1 Parent(s): f40a04a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -27
app.py CHANGED
@@ -1,6 +1,3 @@
1
- # Import the GPU decorator for ZeroGPU Spaces
2
- # This will be a no-op if the space is not configured for ZeroGPU
3
- # but it is required for the specified hardware to work correctly.
4
  from spaces import GPU
5
 
6
  import os
@@ -8,13 +5,19 @@ import cv2
8
  import numpy as np
9
  import torch
10
  import tempfile
 
11
  import gradio as gr
12
  from PIL import Image
13
  from pdf2image import convert_from_path
14
  from reportlab.lib.pagesizes import letter
15
  from reportlab.pdfgen import canvas
16
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
17
- from paddleocr import PaddleOCR, TextDetection
 
 
 
 
 
18
 
19
  # Set the GPU device if available
20
  # The `spaces.GPU` decorator handles the dynamic GPU allocation, but we still need to
@@ -80,8 +83,10 @@ def process_image_page(img):
80
  arr = []
81
  # The OCR result is a list of lists, where each inner list represents a text line.
82
  # The first element is the bounding box coordinates.
83
- for line in ocr_result[0]:
84
- arr.append(line[0])
 
 
85
 
86
  print(f"Detected {len(arr)} lines in this page.")
87
 
@@ -110,7 +115,7 @@ def process_image_page(img):
110
  warped = cv2.warpPerspective(img, M, (width, height))
111
  cropped_images.append(warped)
112
 
113
- # Reverse cropped images and corresponding boxes
114
  cropped_images.reverse()
115
  arr.reverse()
116
 
@@ -135,27 +140,38 @@ def process_file_and_create_pdf(file):
135
  The @GPU decorator ensures this function is run on the GPU.
136
  """
137
  if file is None:
138
- return None, "Please upload a file."
139
 
140
  temp_output_dir = tempfile.mkdtemp()
141
  output_pdf_path = os.path.join(temp_output_dir, "ocr_results.pdf")
 
 
142
 
143
  try:
144
  if file.name.lower().endswith('.pdf'):
145
  # Convert PDF to images
146
  print(f"Converting PDF {file.name} to images...")
147
- # Use `poppler_path` if poppler is installed on the system, otherwise
148
- # it might be necessary to install it via a `packages.txt` file.
149
- # Here we assume it's available.
150
  images = convert_from_path(file.name, dpi=300)
151
-
 
 
 
 
152
  c = canvas.Canvas(output_pdf_path, pagesize=letter)
153
  width, height = letter
154
 
155
  for page_num, page in enumerate(images):
156
  print(f"\nProcessing page {page_num + 1}")
157
  img_cv = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
158
- results, original_image = process_image_page(img_cv)
 
 
 
 
 
 
 
 
159
 
160
  c.setFont("Helvetica-Bold", 14)
161
  c.drawString(50, height - 40, f"Page {page_num + 1} - OCR Results")
@@ -178,6 +194,15 @@ def process_file_and_create_pdf(file):
178
  if img_cv is None:
179
  raise ValueError("Failed to load image.")
180
 
 
 
 
 
 
 
 
 
 
181
  results, original_image = process_image_page(img_cv)
182
 
183
  c = canvas.Canvas(output_pdf_path, pagesize=letter)
@@ -185,14 +210,11 @@ def process_file_and_create_pdf(file):
185
  c.setFont("Helvetica-Bold", 14)
186
  c.drawString(50, height - 40, "Image OCR Results")
187
 
188
- # The input file from Gradio is a temp file that will be cleaned up.
189
- # We can't display it directly in the PDF from its path.
190
- # To draw it in the PDF, we save it to a new temporary path.
191
  temp_img_path = os.path.join(temp_output_dir, "original_image.png")
192
  original_image.save(temp_img_path)
193
-
194
- # Draw the image on the PDF
195
  c.drawImage(temp_img_path, 50, height - 300, width=200, preserveAspectRatio=True)
 
196
 
197
  y = height - 350
198
  c.setFont("Helvetica", 12)
@@ -204,15 +226,15 @@ def process_file_and_create_pdf(file):
204
  c.setFont("Helvetica", 12)
205
  y = height - 50
206
  c.save()
207
- os.remove(temp_img_path)
208
 
209
- return output_pdf_path
210
 
211
  except Exception as e:
212
  print(f"An error occurred: {e}")
213
  # Clean up temporary directory on error
214
- # shutil.rmtree(temp_output_dir)
215
- return None
 
216
 
217
  # Gradio Interface
218
  # The `@GPU` decorator is used here to ensure this function runs on a GPU.
@@ -220,17 +242,20 @@ def process_file_and_create_pdf(file):
220
  def process_file_for_gradio(file):
221
  # This wrapper function is needed because Gradio's `File` component passes a temp file.
222
  # We call our main processing function and return the path to the output PDF.
223
- output_path = process_file_and_create_pdf(file)
224
  if output_path is None:
225
- return None
226
- return output_path
227
 
228
  demo = gr.Interface(
229
  fn=process_file_for_gradio,
230
  inputs=gr.File(label="Upload an Image (PNG, JPG) or a PDF", file_types=['.png', '.jpg', '.jpeg', '.pdf']),
231
- outputs=gr.File(label="Download OCR Results PDF"),
 
 
 
232
  title="OCR App with PaddleOCR and TrOCR",
233
- description="Upload an image or a multi-page PDF to get an output PDF with the recognized text from each page.",
234
  examples=[
235
  # Here you can provide paths to example files in your repo
236
  # "example.png",
 
 
 
 
1
  from spaces import GPU
2
 
3
  import os
 
5
  import numpy as np
6
  import torch
7
  import tempfile
8
+ import shutil
9
  import gradio as gr
10
  from PIL import Image
11
  from pdf2image import convert_from_path
12
  from reportlab.lib.pagesizes import letter
13
  from reportlab.pdfgen import canvas
14
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
15
+ from paddleocr import PaddleOCR
16
+ from paddleocr.ppocr.utils.logging import disable_logger
17
+ from IPython.display import display
18
+
19
+ # Disable PaddleOCR logging for a cleaner output
20
+ disable_logger()
21
 
22
  # Set the GPU device if available
23
  # The `spaces.GPU` decorator handles the dynamic GPU allocation, but we still need to
 
83
  arr = []
84
  # The OCR result is a list of lists, where each inner list represents a text line.
85
  # The first element is the bounding box coordinates.
86
+ # Check if ocr_result is not None and has at least one element
87
+ if ocr_result and ocr_result[0]:
88
+ for line in ocr_result[0]:
89
+ arr.append(line[0])
90
 
91
  print(f"Detected {len(arr)} lines in this page.")
92
 
 
115
  warped = cv2.warpPerspective(img, M, (width, height))
116
  cropped_images.append(warped)
117
 
118
+ # Reverse cropped images and corresponding boxes to match reading order
119
  cropped_images.reverse()
120
  arr.reverse()
121
 
 
140
  The @GPU decorator ensures this function is run on the GPU.
141
  """
142
  if file is None:
143
+ return None, None
144
 
145
  temp_output_dir = tempfile.mkdtemp()
146
  output_pdf_path = os.path.join(temp_output_dir, "ocr_results.pdf")
147
+
148
+ input_image_for_display = None
149
 
150
  try:
151
  if file.name.lower().endswith('.pdf'):
152
  # Convert PDF to images
153
  print(f"Converting PDF {file.name} to images...")
 
 
 
154
  images = convert_from_path(file.name, dpi=300)
155
+
156
+ if images:
157
+ # Set the first page as the image to display
158
+ input_image_for_display = images[0]
159
+
160
  c = canvas.Canvas(output_pdf_path, pagesize=letter)
161
  width, height = letter
162
 
163
  for page_num, page in enumerate(images):
164
  print(f"\nProcessing page {page_num + 1}")
165
  img_cv = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
166
+
167
+ # Check if the background is dark and text is light (simple heuristic)
168
+ gray_image = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
169
+ avg_intensity = np.mean(gray_image)
170
+ if avg_intensity < 100: # Example threshold
171
+ print("Inverting colors for dark background.")
172
+ img_cv = cv2.bitwise_not(img_cv)
173
+
174
+ results, _ = process_image_page(img_cv)
175
 
176
  c.setFont("Helvetica-Bold", 14)
177
  c.drawString(50, height - 40, f"Page {page_num + 1} - OCR Results")
 
194
  if img_cv is None:
195
  raise ValueError("Failed to load image.")
196
 
197
+ input_image_for_display = Image.fromarray(cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB))
198
+
199
+ # Check if the background is dark and text is light (simple heuristic)
200
+ gray_image = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
201
+ avg_intensity = np.mean(gray_image)
202
+ if avg_intensity < 100: # Example threshold
203
+ print("Inverting colors for dark background.")
204
+ img_cv = cv2.bitwise_not(img_cv)
205
+
206
  results, original_image = process_image_page(img_cv)
207
 
208
  c = canvas.Canvas(output_pdf_path, pagesize=letter)
 
210
  c.setFont("Helvetica-Bold", 14)
211
  c.drawString(50, height - 40, "Image OCR Results")
212
 
213
+ # Draw the original image on the PDF for context
 
 
214
  temp_img_path = os.path.join(temp_output_dir, "original_image.png")
215
  original_image.save(temp_img_path)
 
 
216
  c.drawImage(temp_img_path, 50, height - 300, width=200, preserveAspectRatio=True)
217
+ os.remove(temp_img_path)
218
 
219
  y = height - 350
220
  c.setFont("Helvetica", 12)
 
226
  c.setFont("Helvetica", 12)
227
  y = height - 50
228
  c.save()
 
229
 
230
+ return output_pdf_path, input_image_for_display
231
 
232
  except Exception as e:
233
  print(f"An error occurred: {e}")
234
  # Clean up temporary directory on error
235
+ if os.path.exists(temp_output_dir):
236
+ shutil.rmtree(temp_output_dir)
237
+ return None, None
238
 
239
  # Gradio Interface
240
  # The `@GPU` decorator is used here to ensure this function runs on a GPU.
 
242
  def process_file_for_gradio(file):
243
  # This wrapper function is needed because Gradio's `File` component passes a temp file.
244
  # We call our main processing function and return the output PDF path together with the input image preview.
245
+ output_path, input_image = process_file_and_create_pdf(file)
246
  if output_path is None:
247
+ return None, None
248
+ return output_path, input_image
249
 
250
  demo = gr.Interface(
251
  fn=process_file_for_gradio,
252
  inputs=gr.File(label="Upload an Image (PNG, JPG) or a PDF", file_types=['.png', '.jpg', '.jpeg', '.pdf']),
253
+ outputs=[
254
+ gr.File(label="Download OCR Results PDF", interactive=False, visible=True),
255
+ gr.Image(label="Uploaded Image Preview", interactive=False)
256
+ ],
257
  title="OCR App with PaddleOCR and TrOCR",
258
+ description="Upload an image or a multi-page PDF to get an output PDF with the recognized text from each page. The output PDF will be downloaded automatically.",
259
  examples=[
260
  # Here you can provide paths to example files in your repo
261
  # "example.png",