|
|
import logging |
|
|
from io import BytesIO |
|
|
from pathlib import Path |
|
|
from typing import Set, Union |
|
|
|
|
|
from docling_core.types.doc import ( |
|
|
BoundingBox, |
|
|
CoordOrigin, |
|
|
DocItemLabel, |
|
|
DoclingDocument, |
|
|
DocumentOrigin, |
|
|
GroupLabel, |
|
|
ImageRef, |
|
|
ProvenanceItem, |
|
|
Size, |
|
|
TableCell, |
|
|
TableData, |
|
|
) |
|
|
from PIL import Image, UnidentifiedImageError |
|
|
from pptx import Presentation |
|
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER |
|
|
|
|
|
from docling.backend.abstract_backend import ( |
|
|
DeclarativeDocumentBackend, |
|
|
PaginatedDocumentBackend, |
|
|
) |
|
|
from docling.datamodel.base_models import InputFormat |
|
|
from docling.datamodel.document import InputDocument |
|
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
_log = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
    """Backend that converts a PowerPoint presentation into a DoclingDocument.

    The presentation is loaded with python-pptx when the backend is
    constructed. During conversion every slide becomes one page plus one
    ``GroupLabel.CHAPTER`` group, and the slide's shapes (text, pictures,
    tables and the members of grouped shapes) are added under that group.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Load the presentation from a path or an in-memory stream.

        Args:
            in_doc: Input document descriptor, forwarded to the base class.
            path_or_stream: A ``Path`` to the file or a ``BytesIO`` holding
                its bytes.

        Raises:
            RuntimeError: If python-pptx fails to open the presentation; the
                original exception is chained as the cause.
        """
        super().__init__(in_doc, path_or_stream)
        # XML namespaces used when querying the raw slide elements.
        self.namespaces = {
            "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
            "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
            "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
        }

        self.path_or_stream = path_or_stream

        self.pptx_obj = None
        self.valid = False
        try:
            if isinstance(self.path_or_stream, BytesIO):
                self.pptx_obj = Presentation(self.path_or_stream)
            elif isinstance(self.path_or_stream, Path):
                self.pptx_obj = Presentation(str(self.path_or_stream))
        except Exception as e:
            raise RuntimeError(
                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

        # Only report the backend as valid when a presentation was actually
        # loaded; an unsupported ``path_or_stream`` type leaves ``pptx_obj``
        # as None and must not be advertised as usable.
        self.valid = self.pptx_obj is not None

    def page_count(self) -> int:
        """Return the number of slides, or 0 when no valid presentation is loaded."""
        if self.pptx_obj is None or not self.is_valid():
            return 0
        return len(self.pptx_obj.slides)

    def is_valid(self) -> bool:
        """Return whether the presentation was loaded successfully."""
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Report that this backend is paginated (one page per slide)."""
        return True

    def unload(self):
        """Close the input stream if it is a ``BytesIO`` and drop the reference to it."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend (PPTX only)."""
        return {InputFormat.PPTX}

    def convert(self) -> DoclingDocument:
        """Build a ``DoclingDocument`` from the loaded presentation.

        Returns:
            The document populated by ``walk_linear``, with origin metadata
            taken from ``self.file`` and ``self.document_hash``.
        """
        # NOTE(review): this MIME type is the one registered for legacy .ppt
        # files; kept unchanged because downstream consumers may match on it.
        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="application/vnd.ms-powerpoint",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
        return self.walk_linear(self.pptx_obj, doc)

    def generate_prov(
        self, shape, slide_ind, text="", slide_size=Size(width=1, height=1)
    ):
        """Create the provenance record (page number, char span, bbox) for a shape.

        Args:
            shape: Shape exposing ``left``, ``top``, ``width`` and ``height``.
            slide_ind: Zero-based slide index; the page number is this plus one.
            text: Text whose length defines the end of the char span.
            slide_size: Fallback extent used when the shape has no geometry of
                its own.

        Returns:
            A ``ProvenanceItem`` for the shape.
        """
        geometry = (shape.left, shape.top, shape.width, shape.height)
        if any(value is None for value in geometry):
            # No explicit geometry on the shape: fall back to the whole slide.
            left, top = 0, 0
            width, height = slide_size.width, slide_size.height
        else:
            # Compare against None rather than truthiness so that a shape
            # sitting at x == 0 keeps its own bounding box.
            left, top, width, height = geometry

        shape_bbox = [left, top, left + width, top + height]
        shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT)
        return ProvenanceItem(
            page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox
        )

    def _bullet_kind(self, p) -> Union[str, None]:
        """Classify a paragraph element as "Bullet", "Numbered" or None (no bullet).

        A character bullet (``a:buChar``) takes precedence over automatic
        numbering (``a:buAutoNum``) when both are present.
        """
        ns = {"a": self.namespaces["a"]}
        if p.find(".//a:buChar", namespaces=ns) is not None:
            return "Bullet"
        if p.find(".//a:buAutoNum", namespaces=ns) is not None:
            return "Numbered"
        return None

    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
        """Add the text of a shape to ``doc`` as text items and list items.

        Each paragraph is emitted once: as a list item when its XML carries a
        bullet or auto-numbering marker, otherwise as a text item labelled
        TITLE, SECTION_HEADER or PARAGRAPH depending on the placeholder type.
        All list items of the shape share one list group, created lazily.

        Args:
            shape: Shape with a text frame.
            parent_slide: Group item the new items are attached to.
            slide_ind: Zero-based slide index.
            doc: Document being populated.
            slide_size: Slide extent, forwarded to ``generate_prov``.
        """
        prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
        ns = {"a": self.namespaces["a"]}

        # Pass 1: decide whether the shape's list group is ordered. The group
        # is ordered as soon as any paragraph uses automatic numbering.
        list_label = GroupLabel.LIST
        is_a_list = False
        for paragraph in shape.text_frame.paragraphs:
            kind = self._bullet_kind(paragraph._element)
            # Indented paragraphs (level > 0) are treated as sub-list entries.
            is_a_list = kind is not None or paragraph.level > 0
            if kind == "Numbered":
                list_label = GroupLabel.ORDERED_LIST

        # The flag reflects the last paragraph only; it is used for logging.
        if is_a_list:
            _log.debug("LIST DETECTED!")
        else:
            _log.debug("No List")

        # Pass 2: emit one item per paragraph that has non-blank run text.
        new_list = None
        enum_list_item_value = 0
        for paragraph in shape.text_frame.paragraphs:
            p = paragraph._element
            enum_list_item_value += 1

            # NOTE(review): whitespace-only runs are skipped, so words that are
            # separated only by such a run end up concatenated.
            run_texts = [
                run.text
                for run in p.iterfind(".//a:r", namespaces=ns)
                if run.text.strip()
            ]
            if not run_texts:
                continue
            paragraph_text = "".join(run_texts)

            # Bullet markers live on the paragraph, so classify once per
            # paragraph instead of once per run.
            kind = self._bullet_kind(p)

            if kind is not None:
                is_numbered = kind == "Numbered"
                enum_marker = f"{enum_list_item_value}." if is_numbered else ""
                if new_list is None:
                    new_list = doc.add_group(
                        label=list_label, name="list", parent=parent_slide
                    )
                doc.add_list_item(
                    marker=enum_marker,
                    enumerated=is_numbered,
                    parent=new_list,
                    text=paragraph_text,
                    prov=prov,
                )
                continue

            # Plain text: choose the label from the placeholder type.
            doc_label = DocItemLabel.PARAGRAPH
            if shape.is_placeholder:
                placeholder_type = shape.placeholder_format.type
                if placeholder_type in (
                    PP_PLACEHOLDER.CENTER_TITLE,
                    PP_PLACEHOLDER.TITLE,
                ):
                    doc_label = DocItemLabel.TITLE
                elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                    # Previously a bare expression with no effect; subtitles
                    # are section headers, matching ``handle_title``.
                    doc_label = DocItemLabel.SECTION_HEADER
                # NOTE(review): the numbering reset is scoped to placeholder
                # shapes; the nesting was reconstructed from a source without
                # indentation - confirm against the upstream file.
                enum_list_item_value = 0

            doc.add_text(
                label=doc_label,
                parent=parent_slide,
                text=paragraph_text,
                prov=prov,
            )

    def handle_title(self, shape, parent_slide, slide_ind, doc):
        """Add a title or subtitle placeholder to ``doc``.

        Title and centre-title placeholders are added with the TITLE label,
        subtitle placeholders with SECTION_HEADER. Shapes with blank text or
        any other placeholder type are ignored. No slide size is passed to
        ``generate_prov`` here, so its default fallback extent applies.

        Args:
            shape: Placeholder shape.
            parent_slide: Group item the text is attached to.
            slide_ind: Zero-based slide index.
            doc: Document being populated.
        """
        placeholder_type = shape.placeholder_format.type
        txt = shape.text.strip()
        if not txt:
            return

        prov = self.generate_prov(shape, slide_ind, txt)

        if placeholder_type in (PP_PLACEHOLDER.CENTER_TITLE, PP_PLACEHOLDER.TITLE):
            _log.info("Title found: %s", shape.text)
            doc.add_text(
                label=DocItemLabel.TITLE, parent=parent_slide, text=txt, prov=prov
            )
        elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
            _log.info("Subtitle found: %s", shape.text)
            doc.add_text(
                label=DocItemLabel.SECTION_HEADER,
                parent=parent_slide,
                text=txt,
                prov=prov,
            )

    def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size):
        """Add the image embedded in a picture shape to ``doc``.

        Images that Pillow cannot identify or read are skipped with a warning
        instead of aborting the conversion.

        Args:
            shape: Picture shape exposing ``image``.
            parent_slide: Group item the picture is attached to.
            slide_ind: Zero-based slide index.
            doc: Document being populated.
            slide_size: Slide extent, forwarded to ``generate_prov``.
        """
        try:
            image = shape.image
            image_bytes = image.blob
            # Only the first component of the dpi pair is used.
            im_dpi, _ = image.dpi
            pil_image = Image.open(BytesIO(image_bytes))

            prov = self.generate_prov(shape, slide_ind, "", slide_size)
            doc.add_picture(
                parent=parent_slide,
                image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
                caption=None,
                prov=prov,
            )
        except (UnidentifiedImageError, OSError) as e:
            _log.warning("Warning: image cannot be loaded by Pillow: %s", e)

    def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size):
        """Add the table held by a shape to ``doc``.

        Only cells with non-blank text are recorded, and the table is added
        only when at least one such cell exists. Row and column spans are read
        from the ``rowSpan`` / ``gridSpan`` attributes of the cell XML and
        default to 1.

        Args:
            shape: Shape that may hold a table (``has_table``).
            parent_slide: Group item the table is attached to.
            slide_ind: Zero-based slide index.
            doc: Document being populated.
            slide_size: Slide extent, forwarded to ``generate_prov``.
        """
        if not shape.has_table:
            return

        table = shape.table
        table_xml = shape._element
        prov = self.generate_prov(shape, slide_ind, "", slide_size)

        num_rows = len(table.rows)
        num_cols = 0
        tcells = []

        for row_idx, row in enumerate(table.rows):
            num_cols = max(num_cols, len(row.cells))
            for col_idx, cell in enumerate(row.cells):
                cell_text = cell.text.strip()
                if not cell_text:
                    # Blank cells are never recorded, so skip the XML lookup.
                    continue

                # XPath positions are 1-based.
                matches = table_xml.xpath(
                    f".//a:tbl/a:tr[{row_idx + 1}]/a:tc[{col_idx + 1}]"
                )
                if not matches:
                    continue

                cell_xml = matches[0]
                row_span = int(cell_xml.get("rowSpan", 1))
                col_span = int(cell_xml.get("gridSpan", 1))

                tcells.append(
                    TableCell(
                        text=cell_text,
                        row_span=row_span,
                        col_span=col_span,
                        start_row_offset_idx=row_idx,
                        end_row_offset_idx=row_idx + row_span,
                        start_col_offset_idx=col_idx,
                        end_col_offset_idx=col_idx + col_span,
                        col_header=False,
                        row_header=False,
                    )
                )

        if tcells:
            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
            doc.add_table(parent=parent_slide, data=data, prov=prov)

    def _handle_shape(self, shape, parent_slide, slide_ind, doc, slide_size):
        """Dispatch one shape to the table, picture and text handlers.

        Members of a group shape are processed first, recursively, before the
        group shape itself is examined.
        """
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            for grouped_shape in shape.shapes:
                self._handle_shape(
                    grouped_shape, parent_slide, slide_ind, doc, slide_size
                )

        if shape.has_table:
            self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size)
        if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
            self.handle_pictures(shape, parent_slide, slide_ind, doc, slide_size)

        # Shapes without a ``text`` attribute, or with blank text, have
        # nothing more to contribute.
        text = getattr(shape, "text", None)
        if text is None or not text.strip():
            return
        if not shape.has_text_frame:
            _log.warning("Warning: shape has text but not text_frame")
            return

        self.handle_text_elements(shape, parent_slide, slide_ind, doc, slide_size)

    def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
        """Walk all slides in order and add their content to ``doc``.

        For each slide a ``GroupLabel.CHAPTER`` group named ``slide-<index>``
        and a page (numbered from 1) are created, then every shape on the
        slide is processed.

        Args:
            pptx_obj: Loaded python-pptx presentation.
            doc: Document to populate.

        Returns:
            The same ``doc`` instance, populated.
        """
        slide_width = pptx_obj.slide_width
        slide_height = pptx_obj.slide_height

        # ``enumerate`` already yields the slide position, so no per-slide
        # ``slides.index(...)`` lookup is needed.
        for slide_ind, slide in enumerate(pptx_obj.slides):
            parent_slide = doc.add_group(
                name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=None
            )

            # A fresh Size per slide so pages never share one instance.
            slide_size = Size(width=slide_width, height=slide_height)
            doc.add_page(page_no=slide_ind + 1, size=slide_size)

            for shape in slide.shapes:
                self._handle_shape(shape, parent_slide, slide_ind, doc, slide_size)

        return doc
|
|
|