Spaces:

Mqleet
/

AutoPage

Running

App Files Files Community

AutoPage / docling /backend /asciidoc_backend.py

Mqleet

upd code

fcaa164 about 1 month ago

raw

history blame

14.1 kB

	import logging
	import re
	from io import BytesIO
	from pathlib import Path
	from typing import Set, Union

	from docling_core.types.doc import (
	DocItemLabel,
	DoclingDocument,
	DocumentOrigin,
	GroupItem,
	GroupLabel,
	ImageRef,
	Size,
	TableCell,
	TableData,
	)

	from docling.backend.abstract_backend import DeclarativeDocumentBackend
	from docling.datamodel.base_models import InputFormat
	from docling.datamodel.document import InputDocument

	# Module-level logger, named after this module via __name__.
	_log = logging.getLogger(__name__)


	class AsciiDocBackend(DeclarativeDocumentBackend):
	    """Backend that turns AsciiDoc text into a ``DoclingDocument``.

	    The input is loaded once in ``__init__`` and kept as a list of lines.
	    ``convert`` then scans those lines one at a time and recognises the
	    document title, section headers, list items, ``|===`` tables, block
	    ``image::`` macros, ``.Caption`` lines and plain paragraphs.
	    """

	    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
	        """Load the AsciiDoc source into ``self.lines``.

	        Args:
	            in_doc: Input document descriptor, forwarded to the base class.
	            path_or_stream: In-memory UTF-8 bytes, or a path to a UTF-8 file.

	        Raises:
	            RuntimeError: If the source cannot be read or decoded, or if
	                ``path_or_stream`` is neither a ``BytesIO`` nor a ``Path``.
	        """
	        super().__init__(in_doc, path_or_stream)

	        self.path_or_stream = path_or_stream
	        # Defined before loading so both attributes exist on every code path.
	        self.lines: list[str] = []
	        self.valid = False

	        try:
	            if isinstance(self.path_or_stream, BytesIO):
	                text_stream = self.path_or_stream.getvalue().decode("utf-8")
	                self.lines = text_stream.split("\n")
	            elif isinstance(self.path_or_stream, Path):
	                with open(self.path_or_stream, "r", encoding="utf-8") as f:
	                    self.lines = f.readlines()
	            else:
	                # Previously this case fell through and marked the backend
	                # valid although no lines had been loaded.
	                raise TypeError(
	                    f"Unsupported input type: {type(self.path_or_stream).__name__}"
	                )
	            self.valid = True

	        except Exception as e:
	            raise RuntimeError(
	                f"Could not initialize AsciiDoc backend for file with hash {self.document_hash}."
	            ) from e

	    def is_valid(self) -> bool:
	        """Return True once the source lines were loaded successfully."""
	        return self.valid

	    @classmethod
	    def supports_pagination(cls) -> bool:
	        """Return False: this backend does not paginate its input."""
	        return False

	    def unload(self):
	        """Do nothing; this backend holds no resources that need releasing."""
	        return

	    @classmethod
	    def supported_formats(cls) -> Set[InputFormat]:
	        """Return the set of input formats handled by this backend."""
	        return {InputFormat.ASCIIDOC}

	    def convert(self) -> DoclingDocument:
	        """Parse the loaded AsciiDoc lines into a structured document model."""
	        origin = DocumentOrigin(
	            filename=self.file.name or "file",
	            mimetype="text/asciidoc",
	            binary_hash=self.document_hash,
	        )

	        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
	        return self._parse(doc)

	    def _parse(self, doc: DoclingDocument):
	        """Scan ``self.lines`` and append every recognised item to ``doc``.

	        Each line is classified by the first matching rule, in this order:
	        title, section header, list item, end of list, table start, table
	        row, end of table, picture, caption, caption continuation, blank
	        line (paragraph flush) and paragraph text.

	        Args:
	            doc: Document that receives the parsed items.

	        Returns:
	            The same ``doc`` instance, populated.
	        """
	        in_list = False
	        in_table = False

	        text_data: list[str] = []
	        table_data: list[list[str]] = []
	        caption_data: list[str] = []

	        # parents[k] is the item currently open at nesting depth k, or None.
	        parents: dict[int, Union[GroupItem, None]] = {i: None for i in range(10)}
	        # indents[k] is the indentation width of the list group open at depth
	        # k; it stays None for every depth that does not hold a list group.
	        indents: dict[int, Union[int, None]] = {i: None for i in range(10)}

	        for line in self.lines:
	            stripped = line.strip()

	            # Title
	            if self._is_title(line):
	                item = self._parse_title(line)
	                level = item["level"]

	                # A title ends any list that is still open.
	                self._close_open_lists(parents, indents)
	                in_list = False

	                parents[level] = doc.add_text(
	                    text=item["text"], label=DocItemLabel.TITLE
	                )

	            # Section headers
	            elif self._is_section_header(line):
	                item = self._parse_section_header(line)
	                level = item["level"]

	                # A heading ends any list that is still open; otherwise the
	                # next list item would be compared against stale list state.
	                self._close_open_lists(parents, indents)
	                in_list = False

	                parents[level] = doc.add_heading(
	                    text=item["text"],
	                    level=item["level"],
	                    parent=parents.get(level - 1),
	                )
	                # Everything nested deeper than the new heading is closed.
	                for k in parents:
	                    if k > level:
	                        parents[k] = None

	            # Lists
	            elif self._is_list_item(line):
	                _log.debug("line: %s", line)
	                item = self._parse_list_item(line)
	                _log.debug("parsed list-item: %s", item)

	                level = self._get_current_level(parents)

	                open_new_group = not in_list or indents[level] is None
	                if not open_new_group:
	                    if item["indent"] > indents[level]:
	                        # Deeper indentation starts a nested list.
	                        open_new_group = True
	                    else:
	                        # Shallower indentation closes nested lists until a
	                        # list with a matching or smaller indent is reached.
	                        while (
	                            indents[level] is not None
	                            and item["indent"] < indents[level]
	                        ):
	                            parents[level] = None
	                            indents[level] = None
	                            level -= 1
	                        # Every list level was closed: start a fresh list.
	                        open_new_group = indents[level] is None

	                if open_new_group:
	                    in_list = True
	                    parents[level + 1] = doc.add_group(
	                        parent=parents[level], name="list", label=GroupLabel.LIST
	                    )
	                    indents[level + 1] = item["indent"]

	                doc.add_list_item(
	                    item["text"], parent=self._get_current_parent(parents)
	                )

	            elif in_list:
	                # First non-list line after a list: close every open list
	                # level so later content is not attached to a list group.
	                # NOTE(review): as before, this line itself is consumed and
	                # not processed any further; usually it is a blank line.
	                in_list = False
	                self._close_open_lists(parents, indents)

	            # Tables
	            elif stripped == "|===" and not in_table:  # start of table
	                in_table = True

	            elif self._is_table_line(line):  # within a table
	                in_table = True
	                table_data.append(self._parse_table_line(line))

	            elif in_table:
	                # End of table: any line that is not a table row, including
	                # the closing "|===" delimiter, lands here.
	                if table_data:
	                    caption = None
	                    if caption_data:
	                        caption = doc.add_text(
	                            text=" ".join(caption_data), label=DocItemLabel.CAPTION
	                        )
	                        caption_data = []

	                    data = self._populate_table_as_grid(table_data)
	                    doc.add_table(
	                        data=data,
	                        parent=self._get_current_parent(parents),
	                        caption=caption,
	                    )

	                in_table = False
	                table_data = []

	            # Picture
	            elif self._is_picture(line):
	                caption = None
	                if caption_data:
	                    caption = doc.add_text(
	                        text=" ".join(caption_data), label=DocItemLabel.CAPTION
	                    )
	                    caption_data = []

	                item = self._parse_picture(line)

	                size = None
	                if "width" in item and "height" in item:
	                    try:
	                        width = int(item["width"])
	                        height = int(item["height"])
	                    except ValueError:
	                        # Dimensions such as "50%" are not integers; the
	                        # picture is then added without a size.
	                        pass
	                    else:
	                        size = Size(width=width, height=height)

	                # Non-http locations are turned into file URIs; http(s)
	                # locations leave uri as None.
	                uri = None
	                if "uri" in item and not item["uri"].startswith("http"):
	                    if item["uri"].startswith("//"):
	                        uri = "file:" + item["uri"]
	                    elif item["uri"].startswith("/"):
	                        uri = "file:/" + item["uri"]
	                    else:
	                        uri = "file://" + item["uri"]

	                image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
	                doc.add_picture(image=image, caption=caption)

	            # Caption
	            elif self._is_caption(line) and not caption_data:
	                item = self._parse_caption(line)
	                caption_data.append(item["text"])

	            elif stripped and caption_data:  # allow multiline captions
	                item = self._parse_text(line)
	                caption_data.append(item["text"])

	            # Plain text
	            elif not stripped and text_data:
	                # A blank line ends the paragraph collected so far.
	                doc.add_text(
	                    text=" ".join(text_data),
	                    label=DocItemLabel.PARAGRAPH,
	                    parent=self._get_current_parent(parents),
	                )
	                text_data = []

	            elif stripped:  # allow multiline texts
	                item = self._parse_text(line)
	                text_data.append(item["text"])

	        # Flush a paragraph that runs up to the end of the input.
	        if text_data:
	            doc.add_text(
	                text=" ".join(text_data),
	                label=DocItemLabel.PARAGRAPH,
	                parent=self._get_current_parent(parents),
	            )
	            text_data = []

	        # Flush a table that was never closed by a delimiter line.
	        if in_table and table_data:
	            data = self._populate_table_as_grid(table_data)
	            doc.add_table(data=data, parent=self._get_current_parent(parents))

	            in_table = False
	            table_data = []

	        return doc

	    def _close_open_lists(self, parents, indents):
	        """Close every depth that currently holds an open list group.

	        A depth holds a list group exactly when ``indents`` has a value for
	        it; both mappings are reset to None for those depths.
	        """
	        for depth, indent in indents.items():
	            if indent is not None:
	                parents[depth] = None
	                indents[depth] = None

	    def _get_current_level(self, parents):
	        """Return the depth just before the first empty depth above 0.

	        Returns 0 when no depth above 0 is empty.
	        """
	        for k, v in parents.items():
	            if v is None and k > 0:
	                return k - 1

	        return 0

	    def _get_current_parent(self, parents):
	        """Return the item just before the first empty depth above 0.

	        Returns None when no depth above 0 is empty.
	        """
	        for k, v in parents.items():
	            if v is None and k > 0:
	                return parents[k - 1]

	        return None

	    # ========= Title
	    def _is_title(self, line):
	        """Return a match when the line starts with ``= `` (document title)."""
	        return re.match(r"^= ", line)

	    def _parse_title(self, line):
	        """Return the title text, i.e. the line without its ``= `` prefix."""
	        return {"type": "title", "text": line[2:].strip(), "level": 0}

	    # ========= Section headers
	    def _is_section_header(self, line):
	        """Return a match for ``==``-style markers followed by heading text.

	        Text after the marker is required so that bare delimiter lines such
	        as ``====`` are not treated as headings.
	        """
	        return re.match(r"^==+\s+\S", line)

	    def _parse_section_header(self, line):
	        """Split a section header into its level and text.

	        The level is the number of ``=`` characters minus one.
	        """
	        match = re.match(r"^(=+)\s+(.*)", line)
	        if match is None:
	            # Not reachable for lines accepted by _is_section_header; kept so
	            # a direct call degrades gracefully instead of raising.
	            stripped = line.strip()
	            text = stripped.lstrip("=")
	            marker_len = len(stripped) - len(text)
	            return {
	                "type": "header",
	                "level": max(marker_len - 1, 1),
	                "text": text.strip(),
	            }

	        marker = match.group(1)  # The run of '=' characters
	        text = match.group(2)  # The heading text

	        return {
	            "type": "header",
	            "level": len(marker) - 1,
	            "text": text.strip(),
	        }

	    # ========= Lists
	    def _is_list_item(self, line):
	        """Return a match for an optionally indented list marker.

	        Accepted markers are ``*``, ``-``, ``1.``-style numbers and
	        ``a.``-style words, each followed by a space.
	        """
	        return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)

	    def _parse_list_item(self, line):
	        """Extract the item marker (number or bullet symbol) and the text of the item."""
	        match = re.match(r"^(\s*)(\*|-|\d+\.|\w+\.)\s+(.*)", line)
	        if match is None:
	            # Fallback if no match
	            return {
	                "type": "list_item",
	                "marker": "-",
	                "text": line,
	                "numbered": False,
	                "indent": 0,
	            }

	        indent, marker, text = match.groups()
	        return {
	            "type": "list_item",
	            "marker": marker,  # e.g. "*", "-", "1."
	            "text": text.strip(),
	            "numbered": marker not in ("*", "-"),
	            "indent": len(indent),
	        }

	    # ========= Tables
	    def _is_table_line(self, line):
	        """Return a match when the line starts with ``|`` and has a second ``|``."""
	        return re.match(r"^\|.*\|", line)

	    def _parse_table_line(self, line):
	        """Split a table row on ``|`` and return its non-empty, trimmed cells."""
	        return [cell.strip() for cell in line.split("|") if cell.strip()]

	    def _populate_table_as_grid(self, table_data):
	        """Build a ``TableData`` with one 1x1 cell per parsed cell text.

	        Rows shorter than the widest row are not padded.
	        """
	        num_rows = len(table_data)
	        # default=0 keeps an empty table from raising in max().
	        num_cols = max((len(row) for row in table_data), default=0)

	        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
	        for row_idx, row in enumerate(table_data):
	            for col_idx, text in enumerate(row):
	                row_span = 1
	                col_span = 1

	                cell = TableCell(
	                    text=text,
	                    row_span=row_span,
	                    col_span=col_span,
	                    start_row_offset_idx=row_idx,
	                    end_row_offset_idx=row_idx + row_span,
	                    start_col_offset_idx=col_idx,
	                    end_col_offset_idx=col_idx + col_span,
	                    col_header=False,
	                    row_header=False,
	                )
	                data.table_cells.append(cell)

	        return data

	    # ========= Pictures
	    def _is_picture(self, line):
	        """Return a match when the line starts with the ``image::`` macro."""
	        return re.match(r"^image::", line)

	    def _parse_picture(self, line):
	        """
	        Parse an image macro, extracting its path and attributes.
	        Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]

	        Attributes after the first that have no ``=`` are ignored. When the
	        line does not match the syntax, the whole line is returned as the uri.
	        """
	        # Trailing whitespace or a "\r" would defeat the "]$" anchor below.
	        line = line.strip()

	        mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
	        if mtch is None:
	            return {"type": "picture", "uri": line}

	        picture_info = {"type": "picture", "uri": mtch.group(1).strip()}

	        # Extract optional attributes (alt text, width, height, alignment)
	        attributes = mtch.group(2).split(",")
	        picture_info["alt"] = attributes[0].strip()
	        for attr in attributes[1:]:
	            # partition tolerates values that contain "=" themselves
	            key, sep, value = attr.partition("=")
	            if sep:
	                picture_info[key.strip()] = value.strip()

	        return picture_info

	    # ========= Captions
	    def _is_caption(self, line):
	        """Return a match when the line is a ``.`` followed by caption text."""
	        return re.match(r"^\.(.+)", line)

	    def _parse_caption(self, line):
	        """Return the caption text that follows the leading ``.``."""
	        mtch = re.match(r"^\.(.+)", line)
	        if mtch:
	            text = mtch.group(1)
	            return {"type": "caption", "text": text}

	        return {"type": "caption", "text": ""}

	    # ========= Plain text
	    def _parse_text(self, line):
	        """Return the line as paragraph text with surrounding whitespace removed."""
	        return {"type": "text", "text": line.strip()}