Spaces:
Running
Running
Commit
·
8dc2d5d
1
Parent(s):
1402288
Adding time
Browse files
app.py
CHANGED
|
@@ -12,6 +12,7 @@ from docling_core.types.doc.document import DocTagsDocument
|
|
| 12 |
from transformers import AutoProcessor, AutoModelForVision2Seq
|
| 13 |
from transformers.image_utils import load_image
|
| 14 |
from pathlib import Path
|
|
|
|
| 15 |
|
| 16 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 17 |
|
|
@@ -36,34 +37,35 @@ def get_pdf_page_count(pdf_path):
|
|
| 36 |
return len(reader.pages)
|
| 37 |
|
| 38 |
def get_page_image(pdf_path, page_num):
|
|
|
|
| 39 |
images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
|
| 40 |
page_image = images[0]
|
| 41 |
-
|
|
|
|
| 42 |
|
| 43 |
def get_docling_ocr(pdf_path, page_num):
|
|
|
|
| 44 |
result = converter.convert(pdf_path, page_range=(page_num, page_num))
|
| 45 |
markdown_text_docling = result.document.export_to_markdown()
|
| 46 |
-
|
|
|
|
| 47 |
|
| 48 |
def get_paddle_ocr(pdf_path, page_num):
|
| 49 |
-
|
| 50 |
-
|
| 51 |
output = pipeline.predict(input=np.array(page_image))
|
| 52 |
-
|
| 53 |
markdown_list = []
|
| 54 |
-
|
| 55 |
for res in output:
|
| 56 |
md_info = res.markdown
|
| 57 |
markdown_list.append(md_info)
|
| 58 |
-
|
| 59 |
markdown_text_paddleOCR = pipeline.concatenate_markdown_pages(markdown_list)
|
| 60 |
-
|
|
|
|
| 61 |
|
| 62 |
def get_smoldocling_ocr(pdf_path, page_num):
|
| 63 |
-
|
|
|
|
| 64 |
image = load_image(page_image)
|
| 65 |
-
|
| 66 |
-
# Create input messages
|
| 67 |
messages = [
|
| 68 |
{
|
| 69 |
"role": "user",
|
|
@@ -73,12 +75,9 @@ def get_smoldocling_ocr(pdf_path, page_num):
|
|
| 73 |
]
|
| 74 |
},
|
| 75 |
]
|
| 76 |
-
|
| 77 |
-
# Prepare inputs
|
| 78 |
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
|
| 79 |
inputs = processor(text=prompt, images=[image], return_tensors="pt")
|
| 80 |
inputs = inputs.to(DEVICE)
|
| 81 |
-
|
| 82 |
generated_ids = model.generate(**inputs, max_new_tokens=8192)
|
| 83 |
prompt_length = inputs.input_ids.shape[1]
|
| 84 |
trimmed_generated_ids = generated_ids[:, prompt_length:]
|
|
@@ -86,13 +85,11 @@ def get_smoldocling_ocr(pdf_path, page_num):
|
|
| 86 |
trimmed_generated_ids,
|
| 87 |
skip_special_tokens=False,
|
| 88 |
)[0].lstrip()
|
| 89 |
-
|
| 90 |
-
# Populate document
|
| 91 |
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
| 92 |
doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
|
| 93 |
-
|
| 94 |
markdown_text_smoldocling = doc.export_to_markdown()
|
| 95 |
-
|
|
|
|
| 96 |
|
| 97 |
title = "OCR Arena"
|
| 98 |
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
|
|
@@ -117,16 +114,20 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
|
|
| 117 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
| 118 |
submit_btn = gr.Button("Submit", variant='primary')
|
| 119 |
|
| 120 |
-
submit_btn.click(get_page_image, inputs=[pdf, page_num], outputs=original).then(
|
| 121 |
-
get_docling_ocr, inputs=[pdf, page_num], outputs=docling_ocr_out).then(
|
| 122 |
-
get_paddle_ocr, inputs=[pdf, page_num], outputs=paddle_ocr_out).then(
|
| 123 |
-
get_smoldocling_ocr, inputs=[pdf, page_num], outputs=smoldocling_ocr_out)
|
| 124 |
|
| 125 |
with gr.Column():
|
| 126 |
original = gr.Image(width=640, height=640, label="Original Page", interactive=False)
|
|
|
|
| 127 |
docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text", interactive=False)
|
|
|
|
| 128 |
paddle_ocr_out = gr.Textbox(label="Paddle OCR Output", type="text", interactive=False)
|
|
|
|
| 129 |
smoldocling_ocr_out = gr.Textbox(label="SmolDocling OCR Output", type="text", interactive=False)
|
|
|
|
| 130 |
|
| 131 |
examples_obj = gr.Examples(examples=examples, inputs=[pdf])
|
| 132 |
|
|
|
|
| 12 |
from transformers import AutoProcessor, AutoModelForVision2Seq
|
| 13 |
from transformers.image_utils import load_image
|
| 14 |
from pathlib import Path
|
| 15 |
+
import time
|
| 16 |
|
| 17 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 18 |
|
|
|
|
| 37 |
return len(reader.pages)
|
| 38 |
|
| 39 |
def get_page_image(pdf_path, page_num):
    """Render a single PDF page to an image and report how long it took.

    Args:
        pdf_path: Location of the PDF; forwarded unchanged to
            ``convert_from_path``.
        page_num: Page to render. It is passed as both ``first_page`` and
            ``last_page``, so only that one page is converted.

    Returns:
        A ``(page_image, runtime)`` tuple: the first image returned by
        ``convert_from_path`` and the elapsed time formatted with two
        decimals, e.g. ``"0.42 s"``.

    Raises:
        IndexError: If ``convert_from_path`` returns no images for the
            requested page.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    page_image = images[0]
    runtime = time.perf_counter() - start
    return page_image, f"{runtime:.2f} s"
|
| 45 |
|
| 46 |
def get_docling_ocr(pdf_path, page_num):
    """Convert one PDF page to Markdown with the module-level ``converter``.

    Args:
        pdf_path: Location of the PDF; forwarded unchanged to
            ``converter.convert``.
        page_num: Page to convert. It is used as both ends of
            ``page_range``, so the conversion is limited to that page.

    Returns:
        A ``(markdown_text, runtime)`` tuple: the document exported via
        ``export_to_markdown()`` and the elapsed time formatted with two
        decimals, e.g. ``"1.37 s"``.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    result = converter.convert(pdf_path, page_range=(page_num, page_num))
    markdown_text_docling = result.document.export_to_markdown()
    runtime = time.perf_counter() - start
    return markdown_text_docling, f"{runtime:.2f} s"
|
| 52 |
|
| 53 |
def get_paddle_ocr(pdf_path, page_num):
    """Run the module-level ``pipeline`` on one PDF page and return Markdown.

    The page is first rendered with ``get_page_image``. That call happens
    after the timer starts, so the reported runtime covers page rendering
    as well as the prediction and Markdown concatenation.

    Args:
        pdf_path: Location of the PDF; forwarded to ``get_page_image``.
        page_num: Page to process; forwarded to ``get_page_image``.

    Returns:
        A ``(markdown_text, runtime)`` tuple: the result of
        ``pipeline.concatenate_markdown_pages`` over every prediction's
        ``markdown`` attribute, and the elapsed time formatted with two
        decimals, e.g. ``"2.05 s"``.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    # get_page_image returns (image, runtime); only the image is needed here.
    page_image, _ = get_page_image(pdf_path, page_num)
    output = pipeline.predict(input=np.array(page_image))
    markdown_list = [res.markdown for res in output]
    markdown_text_paddleOCR = pipeline.concatenate_markdown_pages(markdown_list)
    runtime = time.perf_counter() - start
    return markdown_text_paddleOCR, f"{runtime:.2f} s"
|
| 64 |
|
| 65 |
def get_smoldocling_ocr(pdf_path, page_num):
|
| 66 |
+
start = time.time()
|
| 67 |
+
page_image = get_page_image(pdf_path, page_num)[0]
|
| 68 |
image = load_image(page_image)
|
|
|
|
|
|
|
| 69 |
messages = [
|
| 70 |
{
|
| 71 |
"role": "user",
|
|
|
|
| 75 |
]
|
| 76 |
},
|
| 77 |
]
|
|
|
|
|
|
|
| 78 |
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
|
| 79 |
inputs = processor(text=prompt, images=[image], return_tensors="pt")
|
| 80 |
inputs = inputs.to(DEVICE)
|
|
|
|
| 81 |
generated_ids = model.generate(**inputs, max_new_tokens=8192)
|
| 82 |
prompt_length = inputs.input_ids.shape[1]
|
| 83 |
trimmed_generated_ids = generated_ids[:, prompt_length:]
|
|
|
|
| 85 |
trimmed_generated_ids,
|
| 86 |
skip_special_tokens=False,
|
| 87 |
)[0].lstrip()
|
|
|
|
|
|
|
| 88 |
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
|
| 89 |
doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
|
|
|
|
| 90 |
markdown_text_smoldocling = doc.export_to_markdown()
|
| 91 |
+
runtime = time.time() - start
|
| 92 |
+
return markdown_text_smoldocling, f"{runtime:.2f} s"
|
| 93 |
|
| 94 |
title = "OCR Arena"
|
| 95 |
description = "A simple Gradio interface to extract text from PDFs and compare OCR models"
|
|
|
|
| 114 |
clear_btn = gr.ClearButton(components=[pdf, page_num])
|
| 115 |
submit_btn = gr.Button("Submit", variant='primary')
|
| 116 |
|
| 117 |
+
submit_btn.click(get_page_image, inputs=[pdf, page_num], outputs=[original, original_runtime]).then(
|
| 118 |
+
get_docling_ocr, inputs=[pdf, page_num], outputs=[docling_ocr_out, docling_ocr_runtime]).then(
|
| 119 |
+
get_paddle_ocr, inputs=[pdf, page_num], outputs=[paddle_ocr_out, paddle_ocr_runtime]).then(
|
| 120 |
+
get_smoldocling_ocr, inputs=[pdf, page_num], outputs=[smoldocling_ocr_out, smoldocling_ocr_runtime])
|
| 121 |
|
| 122 |
with gr.Column():
|
| 123 |
original = gr.Image(width=640, height=640, label="Original Page", interactive=False)
|
| 124 |
+
original_runtime = gr.Textbox(label="Image Extraction Time", type="text", interactive=False)
|
| 125 |
docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text", interactive=False)
|
| 126 |
+
docling_ocr_runtime = gr.Textbox(label="Docling OCR Time", type="text", interactive=False)
|
| 127 |
paddle_ocr_out = gr.Textbox(label="Paddle OCR Output", type="text", interactive=False)
|
| 128 |
+
paddle_ocr_runtime = gr.Textbox(label="Paddle OCR Time", type="text", interactive=False)
|
| 129 |
smoldocling_ocr_out = gr.Textbox(label="SmolDocling OCR Output", type="text", interactive=False)
|
| 130 |
+
smoldocling_ocr_runtime = gr.Textbox(label="SmolDocling OCR Time", type="text", interactive=False)
|
| 131 |
|
| 132 |
examples_obj = gr.Examples(examples=examples, inputs=[pdf])
|
| 133 |
|