Yaz Hobooti
committed on
Commit
·
b89e133
1
Parent(s):
cd4f997
Add PyMuPDF fallback and comprehensive poppler packages for PDF processing
Browse files
- apt.txt +3 -0
- pdf_comparator.py +27 -3
- requirements.txt +1 -0
apt.txt
CHANGED
|
@@ -2,5 +2,8 @@ poppler-utils
|
|
| 2 |
poppler-data
|
| 3 |
libpoppler-dev
|
| 4 |
libpoppler-cpp-dev
|
|
|
|
|
|
|
| 5 |
tesseract-ocr
|
| 6 |
libzbar0
|
|
|
|
|
|
| 2 |
poppler-data
|
| 3 |
libpoppler-dev
|
| 4 |
libpoppler-cpp-dev
|
| 5 |
+
libpoppler-glib-dev
|
| 6 |
+
libpoppler-qt5-dev
|
| 7 |
tesseract-ocr
|
| 8 |
libzbar0
|
| 9 |
+
ghostscript
|
pdf_comparator.py
CHANGED
|
@@ -16,6 +16,14 @@ from skimage.measure import label, regionprops
|
|
| 16 |
from skimage.morphology import dilation, rectangle
|
| 17 |
import gradio as gr
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# Optional features
|
| 20 |
try:
|
| 21 |
import pytesseract
|
|
@@ -49,7 +57,7 @@ def _is_pdf(path: str) -> bool:
|
|
| 49 |
|
| 50 |
def load_first_page(path: str, dpi: int = 300) -> Image.Image:
|
| 51 |
if _is_pdf(path):
|
| 52 |
-
# Try multiple poppler paths
|
| 53 |
poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
|
| 54 |
|
| 55 |
for poppler_path in poppler_paths:
|
|
@@ -64,9 +72,25 @@ def load_first_page(path: str, dpi: int = 300) -> Image.Image:
|
|
| 64 |
|
| 65 |
return imgs[0].convert("RGB")
|
| 66 |
except Exception as e:
|
| 67 |
-
if poppler_path is None: #
|
| 68 |
-
|
| 69 |
continue # Try next path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
raise ValueError(f"No pages in PDF: {path}")
|
| 72 |
return Image.open(path).convert("RGB")
|
|
|
|
| 16 |
from skimage.morphology import dilation, rectangle
|
| 17 |
import gradio as gr
|
| 18 |
|
| 19 |
+
# Alternative PDF processing
|
| 20 |
+
try:
|
| 21 |
+
import fitz # PyMuPDF
|
| 22 |
+
HAS_PYMUPDF = True
|
| 23 |
+
except Exception:
|
| 24 |
+
fitz = None
|
| 25 |
+
HAS_PYMUPDF = False
|
| 26 |
+
|
| 27 |
# Optional features
|
| 28 |
try:
|
| 29 |
import pytesseract
|
|
|
|
| 57 |
|
| 58 |
def load_first_page(path: str, dpi: int = 300) -> Image.Image:
|
| 59 |
if _is_pdf(path):
|
| 60 |
+
# Try pdf2image with multiple poppler paths first
|
| 61 |
poppler_paths = ["/usr/bin", "/usr/local/bin", "/bin", None]
|
| 62 |
|
| 63 |
for poppler_path in poppler_paths:
|
|
|
|
| 72 |
|
| 73 |
return imgs[0].convert("RGB")
|
| 74 |
except Exception as e:
|
| 75 |
+
if poppler_path is None: # All pdf2image attempts failed
|
| 76 |
+
break
|
| 77 |
continue # Try next path
|
| 78 |
+
|
| 79 |
+
# Fallback to PyMuPDF if pdf2image fails
|
| 80 |
+
if HAS_PYMUPDF:
|
| 81 |
+
try:
|
| 82 |
+
doc = fitz.open(path)
|
| 83 |
+
page = doc[0] # First page
|
| 84 |
+
mat = fitz.Matrix(dpi/72, dpi/72) # Scale factor for DPI
|
| 85 |
+
pix = page.get_pixmap(matrix=mat)
|
| 86 |
+
img_data = pix.tobytes("ppm")
|
| 87 |
+
img = Image.open(io.BytesIO(img_data))
|
| 88 |
+
doc.close()
|
| 89 |
+
return img.convert("RGB")
|
| 90 |
+
except Exception as e:
|
| 91 |
+
raise ValueError(f"Failed to convert PDF with both pdf2image and PyMuPDF. pdf2image error: poppler not found. PyMuPDF error: {str(e)}")
|
| 92 |
+
else:
|
| 93 |
+
raise ValueError(f"Failed to convert PDF to image with all poppler paths. Last error: poppler not found. PyMuPDF not available as fallback.")
|
| 94 |
|
| 95 |
raise ValueError(f"No pages in PDF: {path}")
|
| 96 |
return Image.open(path).convert("RGB")
|
requirements.txt
CHANGED
|
@@ -15,3 +15,4 @@ pandas==2.0.3
|
|
| 15 |
reportlab==4.0.4
|
| 16 |
regex==2023.10.3
|
| 17 |
gradio==4.44.1
|
|
|
|
|
|
| 15 |
reportlab==4.0.4
|
| 16 |
regex==2023.10.3
|
| 17 |
gradio==4.44.1
|
| 18 |
+
PyMuPDF==1.23.8
|