Spaces:

AkashDataScience
/

OCRArena

Running

App Files Files Community

OCRArena / app.py

AkashDataScience

Adding PaddleOCR

03a594f 5 months ago

raw

history blame

2.82 kB

	from PyPDF2 import PdfReader
	import gradio as gr
	from docling.document_converter import DocumentConverter, PdfFormatOption
	from docling.datamodel.pipeline_options import PdfPipelineOptions
	from docling.datamodel.base_models import InputFormat
	from paddleocr import PPStructureV3
	from pdf2image import convert_from_path
	import numpy as np

	# Shared Docling converter, built once at import time and reused by
	# get_docling_ocr. PDF inputs use pipeline options with remote services
	# enabled.
	pipeline_options = PdfPipelineOptions(enable_remote_services=True)
	_pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
	converter = DocumentConverter(
	    format_options={InputFormat.PDF: _pdf_format_option},
	)

	def get_pdf_page_count(pdf_path):
	    """Return the number of pages in the PDF located at ``pdf_path``."""
	    pages = PdfReader(pdf_path).pages
	    return len(pages)

	def get_docling_ocr(pdf_path, page_num):
	    """Convert one page of a PDF with the shared Docling converter.

	    Args:
	        pdf_path: Path of the PDF handed to ``converter.convert``.
	        page_num: Page to convert; used as both ends of ``page_range``.

	    Returns:
	        The Markdown export of the converted document.
	    """
	    single_page = (page_num, page_num)
	    conversion = converter.convert(pdf_path, page_range=single_page)
	    return conversion.document.export_to_markdown()

	# Lazily created PPStructureV3 pipeline shared by every request.
	_paddle_pipeline = None


	def _get_paddle_pipeline():
	    """Return the shared PPStructureV3 pipeline, creating it on first use."""
	    global _paddle_pipeline
	    if _paddle_pipeline is None:
	        _paddle_pipeline = PPStructureV3()
	    return _paddle_pipeline


	def get_paddle_ocr(page_image):
	    """Run the PP-StructureV3 pipeline on one page image.

	    Args:
	        page_image: Page image that ``np.array`` can convert (``inference``
	            passes a page rendered by ``convert_from_path``).

	    Returns:
	        The value of ``concatenate_markdown_pages`` applied to the Markdown
	        info of every prediction result.
	    """
	    # Reuse one pipeline instead of constructing PPStructureV3 on every
	    # call; the constructor is the expensive, input-independent part.
	    pipeline = _get_paddle_pipeline()
	    # NOTE(review): the image is passed in whatever channel order the
	    # caller's image uses -- confirm that matches what the pipeline expects.
	    output = pipeline.predict(input=np.array(page_image))

	    markdown_list = [res.markdown for res in output]
	    return pipeline.concatenate_markdown_pages(markdown_list)

	def inference(pdf_path, page_num):
	    """Run both OCR backends on a single page of a PDF.

	    Args:
	        pdf_path: Path of the PDF to process.
	        page_num: Page to process. The slider feeding this value in this
	            file starts at 1; the value is normalised to ``int`` here.

	    Returns:
	        A ``(docling_ocr, paddle_ocr)`` tuple with the output of
	        ``get_docling_ocr`` and ``get_paddle_ocr`` for that page.

	    Raises:
	        IndexError: If no image could be rendered for ``page_num``.
	    """
	    # UI sliders may deliver the page as a float (e.g. 1.0); both backends
	    # are given a plain integer page number.
	    page_num = int(page_num)

	    docling_ocr = get_docling_ocr(pdf_path, page_num)

	    # Render only the selected page as an image for PaddleOCR.
	    images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
	    if not images:
	        raise IndexError(f"No image could be rendered for page {page_num}")

	    paddle_ocr = get_paddle_ocr(images[0])
	    return docling_ocr, paddle_ocr

	# Text metadata for the demo.
	title = "OCR Arena"
	description = "A simple Gradio interface to extract text from PDFs and compare OCR models"

	# One row per example; each row supplies the single PDF input.
	examples = [
	    ["data/amazon-10-k-2024.pdf"],
	    ["data/goog-10-k-2023.pdf"],
	]

	# UI layout: the left column holds the PDF upload plus the dynamically
	# rendered page controls, the right column holds both OCR outputs.
	with gr.Blocks(theme=gr.themes.Glass()) as demo:
	    with gr.Row():
	        with gr.Column():
	            pdf = gr.File(label="Input PDFs", file_types=[".pdf"])

	            @gr.render(inputs=pdf)
	            def show_slider(pdf_path):
	                """Render the page selector and action buttons for the PDF.

	                With no file uploaded only a placeholder message is shown.
	                The Submit button is created only when a page slider
	                exists, so ``inference`` is never called with a missing
	                file or with the placeholder text as the page number.
	                """
	                if pdf_path is None:
	                    gr.Markdown("## No Input Provided")
	                    return

	                page_count = get_pdf_page_count(pdf_path)
	                page_num = gr.Slider(1, page_count, value=1, step=1, label="Page Number")

	                with gr.Row():
	                    gr.ClearButton(components=[pdf, page_num])
	                    submit_btn = gr.Button("Submit", variant='primary')

	                # The output textboxes are module-level names assigned
	                # further down; they exist by the time this function runs.
	                submit_btn.click(inference, inputs=[pdf, page_num], outputs=[docling_ocr_out, paddle_ocr_out])

	        with gr.Column():
	            docling_ocr_out = gr.Textbox(label="Docling OCR Output", type="text")
	            paddle_ocr_out = gr.Textbox(label="Paddle OCR Output", type="text")

	    examples_obj = gr.Examples(examples=examples, inputs=[pdf])

	demo.launch()