Spaces:
Running
Running
| from PyPDF2 import PdfReader | |
| import gradio as gr | |
| from docling.document_converter import DocumentConverter, PdfFormatOption | |
| from docling.datamodel.pipeline_options import PdfPipelineOptions | |
| from docling.datamodel.base_models import InputFormat | |
| from paddleocr import PPStructureV3 | |
| from pdf2image import convert_from_path | |
| import numpy as np | |
# Shared Docling converter. The PDF pipeline options (remote services
# enabled) are attached to the PDF input format only.
pipeline_options = PdfPipelineOptions(enable_remote_services=True)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    },
)
def get_pdf_page_count(pdf_path):
    """Return the number of pages in the PDF located at ``pdf_path``."""
    return len(PdfReader(pdf_path).pages)
def get_docling_ocr(pdf_path, page_num):
    """Convert a single page of a PDF with Docling and return it as Markdown.

    ``page_num`` is used as both ends of the ``page_range`` handed to the
    shared ``converter``, so only that one page is converted.
    """
    single_page = (page_num, page_num)
    conversion = converter.convert(pdf_path, page_range=single_page)
    return conversion.document.export_to_markdown()
# Lazily created PPStructureV3 pipeline, reused by every call so the pipeline
# is not rebuilt for each request.
_paddle_pipeline = None


def _get_paddle_pipeline():
    """Return the shared PPStructureV3 pipeline, creating it on first use."""
    global _paddle_pipeline
    if _paddle_pipeline is None:
        _paddle_pipeline = PPStructureV3()
    return _paddle_pipeline


def get_paddle_ocr(page_image):
    """Run PP-StructureV3 on one page image and return the result as Markdown.

    Args:
        page_image: The page to analyse; it is converted with ``np.array``
            before being passed to the pipeline.

    Returns:
        The value of ``concatenate_markdown_pages`` applied to the
        ``markdown`` attribute of every prediction result.
    """
    pipeline = _get_paddle_pipeline()
    output = pipeline.predict(input=np.array(page_image))
    markdown_list = [res.markdown for res in output]
    return pipeline.concatenate_markdown_pages(markdown_list)
def inference(pdf_path, page_num):
    """Run both OCR engines on one page of a PDF.

    Args:
        pdf_path: Path of the uploaded PDF, or ``None`` when nothing was
            uploaded.
        page_num: 1-based page number to process. It is coerced to ``int``
            because UI sliders may deliver the value as a float.

    Returns:
        A ``(docling_markdown, paddle_markdown)`` tuple.

    Raises:
        gr.Error: If no PDF or page number was provided, or if the requested
            page could not be rendered to an image.
    """
    if pdf_path is None or page_num is None:
        raise gr.Error("Please upload a PDF and choose a page before submitting.")

    # pdf2image passes the page numbers to the renderer as text, so they
    # must be whole numbers rather than e.g. 1.0.
    page_num = int(page_num)

    docling_ocr = get_docling_ocr(pdf_path, page_num)

    # Render only the selected page as an image for PaddleOCR.
    images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    if not images:
        raise gr.Error(f"Could not render page {page_num} of the PDF.")

    paddle_ocr = get_paddle_ocr(images[0])
    return docling_ocr, paddle_ocr
# Heading and one-line summary for the demo.
title = "OCR Arena"
description = (
    "A simple Gradio interface to extract text from PDFs and compare OCR models"
)

# Sample inputs: one row per example, each row holding a single PDF path.
examples = [
    ["data/amazon-10-k-2024.pdf"],
    ["data/goog-10-k-2023.pdf"],
]
def show_slider(pdf_path):
    """Toggle the "no input" notice and the page slider for the current PDF.

    Args:
        pdf_path: Path of the uploaded PDF, or ``None`` when the file input
            is empty.

    Returns:
        Two ``gr.update`` objects, in order: one for the notice and one for
        the page slider. With a PDF present the notice is hidden and the
        slider is shown, reset to page 1 and limited to the page count.
    """
    if pdf_path is None:
        return gr.update(visible=True), gr.update(visible=False)
    page_count = get_pdf_page_count(pdf_path)
    return (
        gr.update(visible=False),
        gr.update(maximum=page_count, value=1, visible=True),
    )


with gr.Blocks(theme=gr.themes.Glass(), title=title) as demo:
    gr.Markdown(f"# {title}\n{description}")
    with gr.Row():
        with gr.Column():
            pdf = gr.File(label="Input PDFs", file_types=[".pdf"])
            no_input_msg = gr.Markdown("## No Input Provided")
            # The slider is created up front, hidden, so the buttons and
            # event handlers below have a real component to reference.
            page_num = gr.Slider(
                minimum=1,
                maximum=1,
                value=1,
                step=1,
                label="Page Number",
                visible=False,
            )
            with gr.Row():
                clear_btn = gr.ClearButton(components=[pdf, page_num])
                submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column():
            docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text")
            paddle_ocr_out = gr.Textbox(label="Paddle OCR Output", type="text")
    examples_obj = gr.Examples(examples=examples, inputs=[pdf])

    # Events are wired only after every component exists, so the handlers
    # can reference the output boxes defined above.
    pdf.change(show_slider, inputs=[pdf], outputs=[no_input_msg, page_num])
    submit_btn.click(
        inference,
        inputs=[pdf, page_num],
        outputs=[docling_ocr_out, paddle_ocr_out],
    )

demo.launch()