"""Gradio app that extracts text from uploaded PDFs.

Pages are rendered to images with PyMuPDF and recognised with PaddleOCR.
"""

import os
import io
import json
from typing import List, Tuple, Dict, Any

import fitz  # PyMuPDF
from PIL import Image
import gradio as gr


# Lazy-loaded OCR models, keyed by language code. Loading is deferred to the
# first request to reduce startup time and memory, and each language gets its
# own entry so a request for one language never receives another's model.
_ocr_models: Dict[str, Any] = {}


def get_ocr_model(lang: str = "en"):
    """Return the PaddleOCR instance for ``lang``, creating it on first use.

    Instances are cached per language code for the lifetime of the process,
    so repeated calls with the same ``lang`` return the same object.

    Args:
        lang: PaddleOCR language pack code, e.g. ``'en'``, ``'ch'``, ``'fr'``
            or ``'german'``.

    Returns:
        A ``PaddleOCR`` object configured with angle classification enabled
        and logging disabled.
    """
    model = _ocr_models.get(lang)
    if model is None:
        # Imported here to avoid the heavy import at startup. The Spaces
        # container downloads the model weights on first run and caches them.
        from paddleocr import PaddleOCR

        model = PaddleOCR(use_angle_cls=True, lang=lang, show_log=False)
        _ocr_models[lang] = model
    return model


def pdf_page_to_image(pdf_doc: fitz.Document, page_index: int, dpi: int = 170) -> Image.Image:
    """Render one page of an open PDF document as an RGB PIL image.

    Args:
        pdf_doc: An open PyMuPDF document.
        page_index: Zero-based index of the page to render.
        dpi: Target rendering resolution.

    Returns:
        The rendered page as a PIL image built from the pixmap's samples.
    """
    # 72 dpi is the PDF default, so dpi / 72 is the zoom factor on both axes.
    scale = dpi / 72.0
    pixmap = pdf_doc.load_page(page_index).get_pixmap(
        matrix=fitz.Matrix(scale, scale),
        alpha=False,
    )
    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)


def run_paddle_ocr_on_image(image: Image.Image, lang: str = "en") -> Tuple[str, List[Dict[str, Any]]]:
    """Run OCR on a single image and collect the recognised text.

    Args:
        image: The PIL image to recognise.
        lang: Language code used to select the OCR model.

    Returns:
        A pair ``(text, items)``. ``text`` is every recognised line joined
        with newlines. ``items`` holds one dict per detection with the keys
        ``"bbox"``, ``"text"`` and ``"confidence"`` (a float).
    """
    engine = get_ocr_model(lang=lang)

    # PaddleOCR is handed a numpy array rather than the PIL image itself.
    import numpy as np

    raw_pages = engine.ocr(np.array(image), cls=True)

    # The result is a list per image: [[(box, (text, conf)), ...]].
    # Entries that are None are skipped.
    detections: List[Dict[str, Any]] = [
        {"bbox": hit[0], "text": hit[1][0], "confidence": float(hit[1][1])}
        for page_hits in raw_pages
        if page_hits is not None
        for hit in page_hits
    ]

    joined_text = "\n".join(entry["text"] for entry in detections)
    return joined_text, detections


def extract_text_from_pdf(file_obj, dpi: int = 170, max_pages: int | None = None, lang: str = "en") -> Tuple[str, str]:
    """
    Returns combined text and a JSON string with per-page OCR results.
    """
    if file_obj is None:
        return "", json.dumps({"pages": []}, ensure_ascii=False)

    # Gradio may pass a path or a tempfile.NamedTemporaryFile-like with .name
    pdf_path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None)
    if pdf_path is None or not os.path.exists(pdf_path):
        # If bytes were passed, fall back to reading from buffer
        file_bytes = file_obj.read() if hasattr(file_obj, "read") else None
        if not file_bytes:
            return "", json.dumps({"pages": []}, ensure_ascii=False)
        pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
    else:
        pdf_doc = fitz.open(pdf_path)

    try:
        num_pages = pdf_doc.page_count
        if max_pages is not None:
            num_pages = min(num_pages, max_pages)

        all_text_lines: List[str] = []
        pages_payload: List[Dict[str, Any]] = []

        for page_index in range(num_pages):
            image = pdf_page_to_image(pdf_doc, page_index, dpi=dpi)
            page_text, page_items = run_paddle_ocr_on_image(image, lang=lang)

            all_text_lines.append(page_text)
            pages_payload.append({
                "page": page_index + 1,
                "items": page_items,
            })

        combined_text = "\n\n".join([t for t in all_text_lines if t])
        json_payload = json.dumps({"pages": pages_payload}, ensure_ascii=False)

        return combined_text, json_payload
    finally:
        pdf_doc.close()


def gradio_predict(pdf_file):
    """Gradio callback: OCR the given PDF and return only its plain text.

    Pages are rendered at 300 DPI for accuracy, no page limit is applied, and
    English OCR is used. The JSON payload from ``extract_text_from_pdf`` is
    discarded.
    """
    plain_text, _json_payload = extract_text_from_pdf(
        pdf_file,
        dpi=300,
        max_pages=None,
        lang="en",
    )
    return plain_text


# Build the Gradio UI: a header, a single-PDF file input, a text box for the
# OCR result, and a short API note. The Blocks app is bound to `demo`, which
# the entry point at the bottom of the file launches.
with gr.Blocks(title="PDF OCR with PaddleOCR + PyMuPDF") as demo:
    gr.Markdown("""
    # PDF OCR (PaddleOCR + PyMuPDF)
    Upload a PDF to extract text using OCR. The app renders pages with PyMuPDF at a high DPI and uses PaddleOCR for recognition.
    """)

    # Input accepts exactly one file, restricted to the .pdf extension.
    pdf_input = gr.File(label="PDF", file_types=[".pdf"], file_count="single")
    text_output = gr.Textbox(label="Extracted Text", lines=20)

    # Auto-run OCR when a PDF is uploaded. The handler is wired to the file
    # component's change event and exposed under the API name "predict";
    # gradio_predict yields an empty string when it receives None.
    pdf_input.change(fn=gradio_predict, inputs=[pdf_input], outputs=[text_output], api_name="predict")

    # Simple API note
    gr.Markdown("""
    ## API usage
    - Use `gradio_client` to call this Space. Function signature: `gradio_predict(pdf_file)` → `text`.
    """)


# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    # On Spaces, the host/port are managed by the platform. Locally, this runs on 7860 by default.
    demo.launch()