import os
import io
import json
from typing import List, Tuple, Dict, Any
import fitz # PyMuPDF
from PIL import Image
import gradio as gr

# Lazy-load OCR models to reduce startup time and memory
_ocr_models: Dict[str, Any] = {}


def get_ocr_model(lang: str = "en"):
    # Cache one model per language so a later call with a different language
    # does not get a stale model back.
    if lang in _ocr_models:
        return _ocr_models[lang]
    # PaddleOCR supports language packs such as 'en', 'ch', 'fr', 'german', etc.
    # The Spaces container downloads the model weights on first run and caches them.
    from paddleocr import PaddleOCR  # imported here to avoid a heavy import at startup
    _ocr_models[lang] = PaddleOCR(use_angle_cls=True, lang=lang, show_log=False)
    return _ocr_models[lang]
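
# Example (sketch): requesting another language pack, e.g. French, triggers a separate
# one-time weight download the first time it is used:
#   fr_ocr = get_ocr_model(lang="fr")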


def pdf_page_to_image(pdf_doc: fitz.Document, page_index: int, dpi: int = 170) -> Image.Image:
    page = pdf_doc.load_page(page_index)
    zoom = dpi / 72.0  # PDF user space is 72 dpi, so e.g. 170 dpi -> ~2.36x zoom
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat, alpha=False)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    return img


def run_paddle_ocr_on_image(image: Image.Image, lang: str = "en") -> Tuple[str, List[Dict[str, Any]]]:
    ocr = get_ocr_model(lang=lang)
    # Convert the PIL image to a numpy array for PaddleOCR
    import numpy as np
    img_np = np.array(image)
    result = ocr.ocr(img_np, cls=True)
    lines: List[str] = []
    items: List[Dict[str, Any]] = []
    # PaddleOCR returns one list per image: [[(box, (text, conf)), ...]]
    for page_result in result:
        if page_result is None:
            continue
        for det in page_result:
            box = det[0]
            text = det[1][0]
            conf = float(det[1][1])
            lines.append(text)
            items.append({"bbox": box, "text": text, "confidence": conf})
    return "\n".join(lines), items


def extract_text_from_pdf(file_obj, dpi: int = 170, max_pages: int | None = None, lang: str = "en") -> Tuple[str, str]:
    """
    Returns the combined text and a JSON string with per-page OCR results.
    """
    if file_obj is None:
        return "", json.dumps({"pages": []}, ensure_ascii=False)
    # Gradio may pass a path or a tempfile.NamedTemporaryFile-like object with a .name attribute
    pdf_path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None)
    if pdf_path is None or not os.path.exists(pdf_path):
        # If bytes were passed, fall back to reading from the buffer
        file_bytes = file_obj.read() if hasattr(file_obj, "read") else None
        if not file_bytes:
            return "", json.dumps({"pages": []}, ensure_ascii=False)
        pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
    else:
        pdf_doc = fitz.open(pdf_path)
    try:
        num_pages = pdf_doc.page_count
        if max_pages is not None:
            num_pages = min(num_pages, max_pages)
        all_text_lines: List[str] = []
        pages_payload: List[Dict[str, Any]] = []
        for page_index in range(num_pages):
            image = pdf_page_to_image(pdf_doc, page_index, dpi=dpi)
            page_text, page_items = run_paddle_ocr_on_image(image, lang=lang)
            all_text_lines.append(page_text)
            pages_payload.append({
                "page": page_index + 1,
                "items": page_items,
            })
        combined_text = "\n\n".join([t for t in all_text_lines if t])
        json_payload = json.dumps({"pages": pages_payload}, ensure_ascii=False)
        return combined_text, json_payload
    finally:
        pdf_doc.close()
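
# Local usage sketch (the PDF path is a placeholder, not a file bundled with this app):
#   combined, per_page_json = extract_text_from_pdf("sample.pdf", dpi=170, max_pages=2)
#   print(combined)
#   print(json.loads(per_page_json)["pages"][0]["items"][:3])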


def gradio_predict(pdf_file):
    # Render at a high DPI for accuracy and use English OCR by default
    text, _ = extract_text_from_pdf(pdf_file, dpi=300, max_pages=None, lang="en")
    return text


with gr.Blocks(title="PDF OCR with PaddleOCR + PyMuPDF") as demo:
    gr.Markdown("""
# PDF OCR (PaddleOCR + PyMuPDF)

Upload a PDF to extract text using OCR. The app renders pages with PyMuPDF at a high DPI and uses PaddleOCR for recognition.
""")
    pdf_input = gr.File(label="PDF", file_types=[".pdf"], file_count="single")
    text_output = gr.Textbox(label="Extracted Text", lines=20)
    # Run OCR automatically when a PDF is uploaded
    pdf_input.change(fn=gradio_predict, inputs=[pdf_input], outputs=[text_output], api_name="predict")
    # Simple API note
    gr.Markdown("""
## API usage

- Use `gradio_client` to call this Space. Function signature: `gradio_predict(pdf_file)` → `text`.
""")


if __name__ == "__main__":
    # On Spaces, the host/port are managed by the platform. Locally, this runs on port 7860 by default.
    demo.launch()
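    # For local runs that need a specific interface or port, the standard launch
    # options can be passed instead (sketch):
    #   demo.launch(server_name="0.0.0.0", server_port=7860)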