| import logging | |
| from pathlib import Path | |
| import sycamore | |
| from sycamore import ExecMode | |
| from sycamore.data import Document | |
| from sycamore.data.document import DocumentPropertyTypes | |
| from sycamore.functions.document import DrawBoxes, split_and_convert_to_image | |
| from sycamore.transforms.partition import ArynPartitioner | |
| from sycamore.utils.markdown import elements_to_markdown | |
| from .settings import ENABLE_DEBUG_MODE | |
# Raise the root logger to INFO so pipeline progress messages are emitted.
logging.getLogger().setLevel(logging.INFO)

# Root directory under which convert_sycamore writes per-file debug page
# renderings when ENABLE_DEBUG_MODE is set.
SYCAMORE_DEBUG_PATH = Path("/tmp/sycamore")
# parents=True keeps the import from failing with FileNotFoundError when the
# parent directory does not exist yet.
SYCAMORE_DEBUG_PATH.mkdir(parents=True, exist_ok=True)

# Shared partitioner configuration used by convert_sycamore.
# NOTE: the name is misspelled ("paritioner"), but it is referenced elsewhere
# in this module, so it is kept as-is to avoid breaking those references.
paritioner = ArynPartitioner(
    use_partitioning_service=False,
    extract_table_structure=True,
    use_ocr=True,
    extract_images=True,
    device="cpu",
)

# Single Sycamore context, created once at import time in local exec mode.
context = sycamore.init(
    exec_mode=ExecMode.LOCAL,
)
def image_page_filename_fn(doc: Document) -> str:
    """Return the PNG file name used when writing ``doc`` to disk.

    The name is derived from the document's ``PAGE_NUMBER`` property, e.g.
    ``page_3.png`` for a property value of ``3``.

    Raises:
        KeyError: if ``doc.properties`` has no ``PAGE_NUMBER`` entry.
    """
    properties = doc.properties
    page_num = properties[DocumentPropertyTypes.PAGE_NUMBER]
    return f"page_{page_num}.png"
def convert_sycamore(path: str, file_name: str):
    """Partition a PDF with Sycamore and render its elements as markdown.

    Args:
        path: Location of the PDF, passed to ``context.read.binary``.
        file_name: Name of the sub-directory of ``SYCAMORE_DEBUG_PATH`` that
            receives the debug page images.

    Returns:
        A ``(markdown, image_paths)`` tuple. ``markdown`` is the markdown
        rendering of the first document's elements. ``image_paths`` is the
        sorted list of ``*.png`` paths found in the debug directory when
        ``ENABLE_DEBUG_MODE`` is truthy, otherwise an empty list.

    Raises:
        IndexError: if partitioning produced no documents.
    """
    docset = context.read.binary(paths=path, binary_format="pdf").partition(
        partitioner=paritioner,
    )

    documents = docset.take_all()
    if not documents:
        # Same exception type the bare ``[0]`` lookup raised, with a message
        # that identifies the offending input.
        raise IndexError(f"Sycamore produced no documents for {path!r}")
    md = elements_to_markdown(documents[0].elements)

    image_paths = []
    if ENABLE_DEBUG_MODE:
        # Only touch the filesystem when debug output is actually requested.
        # parents=True covers a removed root directory or a file_name that
        # contains path separators.
        debug_path = SYCAMORE_DEBUG_PATH / file_name
        debug_path.mkdir(parents=True, exist_ok=True)

        # NOTE(review): this appears to evaluate the partitioned docset a
        # second time (take_all above, write here) -- confirm whether the
        # result should be cached/materialized to avoid re-partitioning.
        docset.flat_map(split_and_convert_to_image).map_batch(
            DrawBoxes, f_constructor_kwargs={"draw_table_cells": True}
        ).write.files(str(debug_path), filename_fn=image_page_filename_fn)

        # Sorted (lexicographically) so the result is deterministic; glob
        # order is otherwise filesystem-dependent.
        image_paths = sorted(str(image) for image in debug_path.glob("*.png"))

    return md, image_paths