Spaces:

Mqleet
/

AutoPage

Running

File size: 14,050 Bytes

fcaa164

import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Set, Union

from docling_core.types.doc import (
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
    GroupItem,
    GroupLabel,
    ImageRef,
    Size,
    TableCell,
    TableData,
)

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)


class AsciiDocBackend(DeclarativeDocumentBackend):
    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

        self.path_or_stream = path_or_stream

        try:
            if isinstance(self.path_or_stream, BytesIO):
                text_stream = self.path_or_stream.getvalue().decode("utf-8")
                self.lines = text_stream.split("\n")
            if isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    self.lines = f.readlines()
            self.valid = True

        except Exception as e:
            raise RuntimeError(
                f"Could not initialize AsciiDoc backend for file with hash {self.document_hash}."
            ) from e
        return

    def is_valid(self) -> bool:
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        return False

    def unload(self):
        return

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.ASCIIDOC}

    def convert(self) -> DoclingDocument:
        """
        Parses the ASCII into a structured document model.
        """

        origin = DocumentOrigin(
            filename=self.file.name or "file",
            mimetype="text/asciidoc",
            binary_hash=self.document_hash,
        )

        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

        doc = self._parse(doc)

        return doc

    def _parse(self, doc: DoclingDocument):
        """
        Main function that orchestrates the parsing by yielding components:
        title, section headers, text, lists, and tables.
        """

        content = ""

        in_list = False
        in_table = False

        text_data: list[str] = []
        table_data: list[str] = []
        caption_data: list[str] = []

        # parents: dict[int, Union[DocItem, GroupItem, None]] = {}
        parents: dict[int, Union[GroupItem, None]] = {}
        # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
        indents: dict[int, Union[GroupItem, None]] = {}

        for i in range(0, 10):
            parents[i] = None
            indents[i] = None

        for line in self.lines:
            # line = line.strip()

            # Title
            if self._is_title(line):
                item = self._parse_title(line)
                level = item["level"]

                parents[level] = doc.add_text(
                    text=item["text"], label=DocItemLabel.TITLE
                )

            # Section headers
            elif self._is_section_header(line):
                item = self._parse_section_header(line)
                level = item["level"]

                parents[level] = doc.add_heading(
                    text=item["text"], level=item["level"], parent=parents[level - 1]
                )
                for k, v in parents.items():
                    if k > level:
                        parents[k] = None

            # Lists
            elif self._is_list_item(line):

                _log.debug(f"line: {line}")
                item = self._parse_list_item(line)
                _log.debug(f"parsed list-item: {item}")

                level = self._get_current_level(parents)

                if not in_list:
                    in_list = True

                    parents[level + 1] = doc.add_group(
                        parent=parents[level], name="list", label=GroupLabel.LIST
                    )
                    indents[level + 1] = item["indent"]

                elif in_list and item["indent"] > indents[level]:
                    parents[level + 1] = doc.add_group(
                        parent=parents[level], name="list", label=GroupLabel.LIST
                    )
                    indents[level + 1] = item["indent"]

                elif in_list and item["indent"] < indents[level]:

                    # print(item["indent"], " => ", indents[level])
                    while item["indent"] < indents[level]:
                        # print(item["indent"], " => ", indents[level])
                        parents[level] = None
                        indents[level] = None
                        level -= 1

                doc.add_list_item(
                    item["text"], parent=self._get_current_parent(parents)
                )

            elif in_list and not self._is_list_item(line):
                in_list = False

                level = self._get_current_level(parents)
                parents[level] = None

            # Tables
            elif line.strip() == "|===" and not in_table:  # start of table
                in_table = True

            elif self._is_table_line(line):  # within a table
                in_table = True
                table_data.append(self._parse_table_line(line))

            elif in_table and (
                (not self._is_table_line(line)) or line.strip() == "|==="
            ):  # end of table

                caption = None
                if len(caption_data) > 0:
                    caption = doc.add_text(
                        text=" ".join(caption_data), label=DocItemLabel.CAPTION
                    )

                caption_data = []

                data = self._populate_table_as_grid(table_data)
                doc.add_table(
                    data=data, parent=self._get_current_parent(parents), caption=caption
                )

                in_table = False
                table_data = []

            # Picture
            elif self._is_picture(line):

                caption = None
                if len(caption_data) > 0:
                    caption = doc.add_text(
                        text=" ".join(caption_data), label=DocItemLabel.CAPTION
                    )

                caption_data = []

                item = self._parse_picture(line)

                size = None
                if "width" in item and "height" in item:
                    size = Size(width=int(item["width"]), height=int(item["height"]))

                uri = None
                if (
                    "uri" in item
                    and not item["uri"].startswith("http")
                    and item["uri"].startswith("//")
                ):
                    uri = "file:" + item["uri"]
                elif (
                    "uri" in item
                    and not item["uri"].startswith("http")
                    and item["uri"].startswith("/")
                ):
                    uri = "file:/" + item["uri"]
                elif "uri" in item and not item["uri"].startswith("http"):
                    uri = "file://" + item["uri"]

                image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
                doc.add_picture(image=image, caption=caption)

            # Caption
            elif self._is_caption(line) and len(caption_data) == 0:
                item = self._parse_caption(line)
                caption_data.append(item["text"])

            elif (
                len(line.strip()) > 0 and len(caption_data) > 0
            ):  # allow multiline captions
                item = self._parse_text(line)
                caption_data.append(item["text"])

            # Plain text
            elif len(line.strip()) == 0 and len(text_data) > 0:
                doc.add_text(
                    text=" ".join(text_data),
                    label=DocItemLabel.PARAGRAPH,
                    parent=self._get_current_parent(parents),
                )
                text_data = []

            elif len(line.strip()) > 0:  # allow multiline texts

                item = self._parse_text(line)
                text_data.append(item["text"])

        if len(text_data) > 0:
            doc.add_text(
                text=" ".join(text_data),
                label=DocItemLabel.PARAGRAPH,
                parent=self._get_current_parent(parents),
            )
            text_data = []

        if in_table and len(table_data) > 0:
            data = self._populate_table_as_grid(table_data)
            doc.add_table(data=data, parent=self._get_current_parent(parents))

            in_table = False
            table_data = []

        return doc

    def _get_current_level(self, parents):
        for k, v in parents.items():
            if v == None and k > 0:
                return k - 1

        return 0

    def _get_current_parent(self, parents):
        for k, v in parents.items():
            if v == None and k > 0:
                return parents[k - 1]

        return None

    #   =========   Title
    def _is_title(self, line):
        return re.match(r"^= ", line)

    def _parse_title(self, line):
        return {"type": "title", "text": line[2:].strip(), "level": 0}

    #   =========   Section headers
    def _is_section_header(self, line):
        return re.match(r"^==+", line)

    def _parse_section_header(self, line):
        match = re.match(r"^(=+)\s+(.*)", line)

        marker = match.group(1)  # The list marker (e.g., "*", "-", "1.")
        text = match.group(2)  # The actual text of the list item

        header_level = marker.count("=")  # number of '=' represents level
        return {
            "type": "header",
            "level": header_level - 1,
            "text": text.strip(),
        }

    #   =========   Lists
    def _is_list_item(self, line):
        return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)

    def _parse_list_item(self, line):
        """Extract the item marker (number or bullet symbol) and the text of the item."""

        match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
        if match:
            indent = match.group(1)
            marker = match.group(2)  # The list marker (e.g., "*", "-", "1.")
            text = match.group(3)  # The actual text of the list item

            if marker == "*" or marker == "-":
                return {
                    "type": "list_item",
                    "marker": marker,
                    "text": text.strip(),
                    "numbered": False,
                    "indent": 0 if indent == None else len(indent),
                }
            else:
                return {
                    "type": "list_item",
                    "marker": marker,
                    "text": text.strip(),
                    "numbered": True,
                    "indent": 0 if indent == None else len(indent),
                }
        else:
            # Fallback if no match
            return {
                "type": "list_item",
                "marker": "-",
                "text": line,
                "numbered": False,
                "indent": 0,
            }

    #   =========   Tables
    def _is_table_line(self, line):
        return re.match(r"^\|.*\|", line)

    def _parse_table_line(self, line):
        # Split table cells and trim extra spaces
        return [cell.strip() for cell in line.split("|") if cell.strip()]

    def _populate_table_as_grid(self, table_data):

        num_rows = len(table_data)

        # Adjust the table data into a grid format
        num_cols = max(len(row) for row in table_data)

        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
        for row_idx, row in enumerate(table_data):
            # Pad rows with empty strings to match column count
            # grid.append(row + [''] * (max_cols - len(row)))

            for col_idx, text in enumerate(row):
                row_span = 1
                col_span = 1

                cell = TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=False,
                    row_header=False,
                )
                data.table_cells.append(cell)

        return data

    #   =========   Pictures
    def _is_picture(self, line):
        return re.match(r"^image::", line)

    def _parse_picture(self, line):
        """
        Parse an image macro, extracting its path and attributes.
        Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
        """
        mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
        if mtch:
            picture_path = mtch.group(1).strip()
            attributes = mtch.group(2).split(",")
            picture_info = {"type": "picture", "uri": picture_path}

            # Extract optional attributes (alt text, width, height, alignment)
            if attributes:
                picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
                for attr in attributes[1:]:
                    key, value = attr.split("=")
                    picture_info[key.strip()] = value.strip()

            return picture_info

        return {"type": "picture", "uri": line}

    #   =========   Captions
    def _is_caption(self, line):
        return re.match(r"^\.(.+)", line)

    def _parse_caption(self, line):
        mtch = re.match(r"^\.(.+)", line)
        if mtch:
            text = mtch.group(1)
            return {"type": "caption", "text": text}

        return {"type": "caption", "text": ""}

    #   =========   Plain text
    def _parse_text(self, line):
        return {"type": "text", "text": line.strip()}