Yaz Hobooti committed on
Commit
b89e133
·
1 Parent(s): cd4f997

Add PyMuPDF fallback and comprehensive poppler packages for PDF processing

Browse files
Files changed (3) hide show
  1. apt.txt +3 -0
  2. pdf_comparator.py +27 -3
  3. requirements.txt +1 -0
apt.txt CHANGED
@@ -2,5 +2,8 @@ poppler-utils
2
  poppler-data
3
  libpoppler-dev
4
  libpoppler-cpp-dev
 
 
5
  tesseract-ocr
6
  libzbar0
 
 
2
  poppler-data
3
  libpoppler-dev
4
  libpoppler-cpp-dev
5
+ libpoppler-glib-dev
6
+ libpoppler-qt5-dev
7
  tesseract-ocr
8
  libzbar0
9
+ ghostscript
pdf_comparator.py CHANGED
@@ -16,6 +16,14 @@ from skimage.measure import label, regionprops
16
  from skimage.morphology import dilation, rectangle
17
  import gradio as gr
18
 
 
 
 
 
 
 
 
 
19
  # Optional features
20
  try:
21
  import pytesseract
@@ -49,7 +57,7 @@ def _is_pdf(path: str) -> bool:
49
 
50
  def load_first_page(path: str, dpi: int = 300) -> Image.Image:
51
  if _is_pdf(path):
52
- # Try multiple poppler paths
53
  poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
54
 
55
  for poppler_path in poppler_paths:
@@ -64,9 +72,25 @@ def load_first_page(path: str, dpi: int = 300) -> Image.Image:
64
 
65
  return imgs[0].convert("RGB")
66
  except Exception as e:
67
- if poppler_path is None: # Last attempt failed
68
- raise ValueError(f"Failed to convert PDF to image with all poppler paths. Last error: {str(e)}. Make sure poppler-utils is installed.")
69
  continue # Try next path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  raise ValueError(f"No pages in PDF: {path}")
72
  return Image.open(path).convert("RGB")
 
16
  from skimage.morphology import dilation, rectangle
17
  import gradio as gr
18
 
19
+ # Alternative PDF processing
20
+ try:
21
+ import fitz # PyMuPDF
22
+ HAS_PYMUPDF = True
23
+ except Exception:
24
+ fitz = None
25
+ HAS_PYMUPDF = False
26
+
27
  # Optional features
28
  try:
29
  import pytesseract
 
57
 
58
  def load_first_page(path: str, dpi: int = 300) -> Image.Image:
59
  if _is_pdf(path):
60
+ # Try pdf2image with multiple poppler paths first
61
  poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
62
 
63
  for poppler_path in poppler_paths:
 
72
 
73
  return imgs[0].convert("RGB")
74
  except Exception as e:
75
+ if poppler_path is None: # All pdf2image attempts failed
76
+ break
77
  continue # Try next path
78
+
79
+ # Fallback to PyMuPDF if pdf2image fails
80
+ if HAS_PYMUPDF:
81
+ try:
82
+ doc = fitz.open(path)
83
+ page = doc[0] # First page
84
+ mat = fitz.Matrix(dpi/72, dpi/72) # Scale factor for DPI
85
+ pix = page.get_pixmap(matrix=mat)
86
+ img_data = pix.tobytes("ppm")
87
+ img = Image.open(io.BytesIO(img_data))
88
+ doc.close()
89
+ return img.convert("RGB")
90
+ except Exception as e:
91
+ raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. pdf2image error: poppler not found. PyMuPDF error: {str(e)}")
92
+ else:
93
+ raise ValueError(f"Failed to convert PDF to image with all poppler paths. Last error: poppler not found. PyMuPDF not available as fallback.")
94
 
95
  raise ValueError(f"No pages in PDF: {path}")
96
  return Image.open(path).convert("RGB")
requirements.txt CHANGED
@@ -15,3 +15,4 @@ pandas==2.0.3
15
  reportlab==4.0.4
16
  regex==2023.10.3
17
  gradio==4.44.1
 
 
15
  reportlab==4.0.4
16
  regex==2023.10.3
17
  gradio==4.44.1
18
+ PyMuPDF==1.23.8