|
|
import logging |
|
|
from typing import Any, Dict, Iterable, List, Tuple, Union |
|
|
|
|
|
from docling_core.types.doc import BoundingBox, CoordOrigin |
|
|
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table |
|
|
|
|
|
from docling.datamodel.base_models import OcrCell |
|
|
from docling.datamodel.document import ConversionResult, Page |
|
|
|
|
|
_log = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
def generate_multimodal_pages( |
|
|
doc_result: ConversionResult, |
|
|
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]: |
|
|
|
|
|
label_to_doclaynet = { |
|
|
"title": "title", |
|
|
"table-of-contents": "document_index", |
|
|
"subtitle-level-1": "section_header", |
|
|
"checkbox-selected": "checkbox_selected", |
|
|
"checkbox-unselected": "checkbox_unselected", |
|
|
"caption": "caption", |
|
|
"page-header": "page_header", |
|
|
"page-footer": "page_footer", |
|
|
"footnote": "footnote", |
|
|
"table": "table", |
|
|
"formula": "formula", |
|
|
"list-item": "list_item", |
|
|
"code": "code", |
|
|
"figure": "picture", |
|
|
"picture": "picture", |
|
|
"reference": "text", |
|
|
"paragraph": "text", |
|
|
"text": "text", |
|
|
} |
|
|
|
|
|
content_text = "" |
|
|
page_no = 0 |
|
|
start_ix = 0 |
|
|
end_ix = 0 |
|
|
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = [] |
|
|
|
|
|
doc = doc_result.legacy_document |
|
|
|
|
|
def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page): |
|
|
segments = [] |
|
|
|
|
|
for ix, item in doc_items: |
|
|
item_type = item.obj_type |
|
|
label = label_to_doclaynet.get(item_type, None) |
|
|
|
|
|
if label is None or item.prov is None or page.size is None: |
|
|
continue |
|
|
|
|
|
bbox = BoundingBox.from_tuple( |
|
|
tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT |
|
|
) |
|
|
new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized( |
|
|
page_size=page.size |
|
|
) |
|
|
|
|
|
new_segment = { |
|
|
"index_in_doc": ix, |
|
|
"label": label, |
|
|
"text": item.text if item.text is not None else "", |
|
|
"bbox": new_bbox.as_tuple(), |
|
|
"data": [], |
|
|
} |
|
|
|
|
|
if isinstance(item, Table): |
|
|
table_html = item.export_to_html() |
|
|
new_segment["data"].append( |
|
|
{ |
|
|
"html_seq": table_html, |
|
|
"otsl_seq": "", |
|
|
} |
|
|
) |
|
|
|
|
|
segments.append(new_segment) |
|
|
|
|
|
return segments |
|
|
|
|
|
def _process_page_cells(page: Page): |
|
|
cells: List[dict] = [] |
|
|
if page.size is None: |
|
|
return cells |
|
|
for cell in page.cells: |
|
|
new_bbox = cell.bbox.to_top_left_origin( |
|
|
page_height=page.size.height |
|
|
).normalized(page_size=page.size) |
|
|
is_ocr = isinstance(cell, OcrCell) |
|
|
ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0 |
|
|
cells.append( |
|
|
{ |
|
|
"text": cell.text, |
|
|
"bbox": new_bbox.as_tuple(), |
|
|
"ocr": is_ocr, |
|
|
"ocr_confidence": ocr_confidence, |
|
|
} |
|
|
) |
|
|
return cells |
|
|
|
|
|
def _process_page(): |
|
|
page_ix = page_no - 1 |
|
|
page = doc_result.pages[page_ix] |
|
|
|
|
|
page_cells = _process_page_cells(page=page) |
|
|
page_segments = _process_page_segments(doc_items=doc_items, page=page) |
|
|
content_md = doc.export_to_markdown( |
|
|
main_text_start=start_ix, main_text_stop=end_ix |
|
|
) |
|
|
|
|
|
content_dt = doc.export_to_document_tokens( |
|
|
main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False |
|
|
) |
|
|
|
|
|
return content_text, content_md, content_dt, page_cells, page_segments, page |
|
|
|
|
|
if doc.main_text is None: |
|
|
return |
|
|
for ix, orig_item in enumerate(doc.main_text): |
|
|
|
|
|
item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item |
|
|
if item is None or item.prov is None or len(item.prov) == 0: |
|
|
_log.debug(f"Skipping item {orig_item}") |
|
|
continue |
|
|
|
|
|
item_page = item.prov[0].page |
|
|
|
|
|
|
|
|
if page_no > 0 and item_page > page_no: |
|
|
yield _process_page() |
|
|
|
|
|
start_ix = ix |
|
|
doc_items = [] |
|
|
content_text = "" |
|
|
|
|
|
page_no = item_page |
|
|
end_ix = ix |
|
|
doc_items.append((ix, item)) |
|
|
if item.text is not None and item.text != "": |
|
|
content_text += item.text + " " |
|
|
|
|
|
if len(doc_items) > 0: |
|
|
yield _process_page() |
|
|
|