Spaces:
Running
Running
Commit
·
1402288
1
Parent(s):
f449995
Execution changes
Browse files
app.py
CHANGED
|
@@ -35,12 +35,19 @@ def get_pdf_page_count(pdf_path):
|
|
| 35 |
reader = PdfReader(pdf_path)
|
| 36 |
return len(reader.pages)
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
def get_docling_ocr(pdf_path, page_num):
|
| 39 |
result = converter.convert(pdf_path, page_range=(page_num, page_num))
|
| 40 |
markdown_text_docling = result.document.export_to_markdown()
|
| 41 |
return markdown_text_docling
|
| 42 |
|
| 43 |
-
def get_paddle_ocr(
|
|
|
|
|
|
|
| 44 |
output = pipeline.predict(input=np.array(page_image))
|
| 45 |
|
| 46 |
markdown_list = []
|
|
@@ -52,7 +59,8 @@ def get_paddle_ocr(page_image):
|
|
| 52 |
markdown_text_paddleOCR = pipeline.concatenate_markdown_pages(markdown_list)
|
| 53 |
return markdown_text_paddleOCR
|
| 54 |
|
| 55 |
-
def get_smoldocling_ocr(
|
|
|
|
| 56 |
image = load_image(page_image)
|
| 57 |
|
| 58 |
# Create input messages
|
|
@@ -85,16 +93,6 @@ def get_smoldocling_ocr(page_image):
|
|
| 85 |
|
| 86 |
markdown_text_smoldocling = doc.export_to_markdown()
|
| 87 |
return markdown_text_smoldocling
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
def inference(pdf_path, page_num):
|
| 91 |
-
docling_ocr = get_docling_ocr(pdf_path, page_num)
|
| 92 |
-
# Extract the requested page (page_num) as an image
|
| 93 |
-
images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
|
| 94 |
-
page_image = images[0]
|
| 95 |
-
paddle_ocr = get_paddle_ocr(page_image)
|
| 96 |
-
smoldocling_ocr = get_smoldocling_ocr(page_image)
|
| 97 |
-
return page_image, docling_ocr, paddle_ocr, smoldocling_ocr
|
| 98 |
|
| 99 |
title = "OCR Arena"
|
| 100 |
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
|
|
@@ -102,6 +100,7 @@ examples = [["data/amazon-10-k-2024.pdf"],
|
|
| 102 |
["data/goog-10-k-2023.pdf"]]
|
| 103 |
|
| 104 |
with gr.Blocks(theme=gr.themes.Glass()) as demo:
|
|
|
|
| 105 |
with gr.Row():
|
| 106 |
with gr.Column():
|
| 107 |
pdf = gr.File(label="Input PDFs", file_types=[".pdf"])
|
|
@@ -118,7 +117,10 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
|
|
| 118 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
| 119 |
submit_btn = gr.Button("Submit", variant='primary')
|
| 120 |
|
| 121 |
-
submit_btn.click(
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
with gr.Column():
|
| 124 |
original = gr.Image(width=640, height=640, label="Original Page", interactive=False)
|
|
|
|
| 35 |
reader = PdfReader(pdf_path)
|
| 36 |
return len(reader.pages)
|
| 37 |
|
| 38 |
+
def get_page_image(pdf_path, page_num):
|
| 39 |
+
images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
|
| 40 |
+
page_image = images[0]
|
| 41 |
+
return page_image
|
| 42 |
+
|
| 43 |
def get_docling_ocr(pdf_path, page_num):
|
| 44 |
result = converter.convert(pdf_path, page_range=(page_num, page_num))
|
| 45 |
markdown_text_docling = result.document.export_to_markdown()
|
| 46 |
return markdown_text_docling
|
| 47 |
|
| 48 |
+
def get_paddle_ocr(pdf_path, page_num):
|
| 49 |
+
page_image = get_page_image(pdf_path, page_num)
|
| 50 |
+
|
| 51 |
output = pipeline.predict(input=np.array(page_image))
|
| 52 |
|
| 53 |
markdown_list = []
|
|
|
|
| 59 |
markdown_text_paddleOCR = pipeline.concatenate_markdown_pages(markdown_list)
|
| 60 |
return markdown_text_paddleOCR
|
| 61 |
|
| 62 |
+
def get_smoldocling_ocr(pdf_path, page_num):
|
| 63 |
+
page_image = get_page_image(pdf_path, page_num)
|
| 64 |
image = load_image(page_image)
|
| 65 |
|
| 66 |
# Create input messages
|
|
|
|
| 93 |
|
| 94 |
markdown_text_smoldocling = doc.export_to_markdown()
|
| 95 |
return markdown_text_smoldocling
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
title = "OCR Arena"
|
| 98 |
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
|
|
|
|
| 100 |
["data/goog-10-k-2023.pdf"]]
|
| 101 |
|
| 102 |
with gr.Blocks(theme=gr.themes.Glass()) as demo:
|
| 103 |
+
gr.Markdown(f"# {title}\n{description}")
|
| 104 |
with gr.Row():
|
| 105 |
with gr.Column():
|
| 106 |
pdf = gr.File(label="Input PDFs", file_types=[".pdf"])
|
|
|
|
| 117 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
| 118 |
submit_btn = gr.Button("Submit", variant='primary')
|
| 119 |
|
| 120 |
+
submit_btn.click(get_page_image, inputs=[pdf, page_num], outputs=original).then(
|
| 121 |
+
get_docling_ocr, inputs=[pdf, page_num], outputs=docling_ocr_out).then(
|
| 122 |
+
get_paddle_ocr, inputs=[pdf, page_num], outputs=paddle_ocr_out).then(
|
| 123 |
+
get_smoldocling_ocr, inputs=[pdf, page_num], outputs=smoldocling_ocr_out)
|
| 124 |
|
| 125 |
with gr.Column():
|
| 126 |
original = gr.Image(width=640, height=640, label="Original Page", interactive=False)
|