Spaces:

Mqleet
/

AutoPage

Running

App Files Files Community

AutoPage / docling /backend /html_backend.py

Mqleet

upd code

fcaa164 about 1 month ago

raw

history blame

15.9 kB

	import logging
	from io import BytesIO
	from pathlib import Path
	from typing import Optional, Set, Union

	from bs4 import BeautifulSoup, Tag
	from docling_core.types.doc import (
	DocItemLabel,
	DoclingDocument,
	DocumentOrigin,
	GroupLabel,
	TableCell,
	TableData,
	)

	from docling.backend.abstract_backend import DeclarativeDocumentBackend
	from docling.datamodel.base_models import InputFormat
	from docling.datamodel.document import InputDocument

	# Module-level logger, named after this module via __name__.
	_log = logging.getLogger(__name__)


	class HTMLDocumentBackend(DeclarativeDocumentBackend):
	    """Backend that parses HTML with BeautifulSoup and builds a DoclingDocument.

	    While the parsed tree is walked, the backend keeps a cursor into the
	    output hierarchy: ``self.level`` is the current depth and
	    ``self.parents[level]`` is the item that new items at that depth are
	    attached to. ``self.labels`` counts how often each tag name was visited.
	    """

	    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
	        """Read and parse the HTML source.

	        Args:
	            in_doc: Input document descriptor, forwarded to the base class.
	            path_or_stream: In-memory bytes or a filesystem path of the HTML.

	        Raises:
	            RuntimeError: If reading or parsing the source fails.
	        """
	        super().__init__(in_doc, path_or_stream)
	        _log.debug("About to init HTML backend...")
	        self.soup: Optional[Tag] = None
	        # HTML file:
	        self.path_or_stream = path_or_stream
	        # Initialise the parents for the hierarchy
	        self.max_levels = 10
	        self.level = 0
	        self.parents = {i: None for i in range(self.max_levels)}  # type: ignore
	        self.labels = {}  # type: ignore

	        try:
	            if isinstance(self.path_or_stream, BytesIO):
	                text_stream = self.path_or_stream.getvalue()
	                self.soup = BeautifulSoup(text_stream, "html.parser")
	            elif isinstance(self.path_or_stream, Path):
	                with open(self.path_or_stream, "rb") as f:
	                    html_content = f.read()
	                self.soup = BeautifulSoup(html_content, "html.parser")
	        except Exception as e:
	            raise RuntimeError(
	                f"Could not initialize HTML backend for file with hash {self.document_hash}."
	            ) from e

	    def is_valid(self) -> bool:
	        """Return True when the HTML source was parsed into ``self.soup``."""
	        return self.soup is not None

	    @classmethod
	    def supports_pagination(cls) -> bool:
	        """Return False: this backend does not paginate."""
	        return False

	    def unload(self):
	        """Close the input stream (if it is a BytesIO) and drop the reference."""
	        if isinstance(self.path_or_stream, BytesIO):
	            self.path_or_stream.close()

	        self.path_or_stream = None

	    @classmethod
	    def supported_formats(cls) -> Set[InputFormat]:
	        """Return the set of input formats handled by this backend (HTML only)."""
	        return {InputFormat.HTML}

	    def convert(self) -> DoclingDocument:
	        """Build a DoclingDocument from the parsed HTML.

	        Returns:
	            The populated document.

	        Raises:
	            RuntimeError: If the HTML was not parsed during initialisation.
	        """
	        # access self.path_or_stream to load stuff
	        origin = DocumentOrigin(
	            filename=self.file.name or "file",
	            mimetype="text/html",
	            binary_hash=self.document_hash,
	        )

	        doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
	        _log.debug("Trying to convert HTML...")

	        if not self.is_valid():
	            raise RuntimeError(
	                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
	            )

	        assert self.soup is not None  # narrows the Optional for type checkers
	        content = self.soup.body or self.soup
	        # Replace <br> tags with newline characters
	        for br in content.find_all("br"):
	            br.replace_with("\n")
	        return self.walk(content, doc)

	    def walk(self, element: Tag, doc: DoclingDocument):
	        """Pass every direct child of *element* to ``analyse_element``.

	        The walk is best-effort: a node without children (such as a text
	        node) is ignored, and an error raised while treating a child is
	        logged and ends the walk over this element's remaining children
	        instead of propagating to the caller.

	        Returns:
	            The same *doc* that was passed in.
	        """
	        # Text nodes reach this method through analyse_element's fallback
	        # branch; they expose no ``children`` and have nothing to walk.
	        children = getattr(element, "children", None)
	        if children is None:
	            return doc

	        for idx, child in enumerate(children):
	            try:
	                self.analyse_element(child, idx, doc)
	            except Exception:
	                # Lazy %-formatting: passing extra args without a
	                # placeholder makes the logging call itself fail.
	                _log.exception("error treating child element: %s", child)
	                break

	        return doc

	    def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
	        """Count the node's tag name and route the node to its handler.

	        Nodes without a dedicated handler are walked so that supported
	        descendants are still processed.
	        """
	        name = element.name
	        self.labels[name] = self.labels.get(name, 0) + 1

	        if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
	            self.handle_header(element, idx, doc)
	        elif name == "p":
	            self.handle_paragraph(element, idx, doc)
	        elif name == "pre":
	            self.handle_code(element, idx, doc)
	        elif name in ("ul", "ol"):
	            self.handle_list(element, idx, doc)
	        elif name == "li":
	            self.handle_listitem(element, idx, doc)
	        elif name == "table":
	            self.handle_table(element, idx, doc)
	        elif name == "figure":
	            self.handle_figure(element, idx, doc)
	        elif name == "img":
	            self.handle_image(element, idx, doc)
	        else:
	            self.walk(element, doc)

	    def get_direct_text(self, item: Tag):
	        """Get the direct text of the <li> element (ignoring nested lists).

	        Returns the first direct string child, stripped, or "" if none.
	        """
	        text = item.find(string=True, recursive=False)
	        if isinstance(text, str):
	            return text.strip()

	        return ""

	    # Function to recursively extract text from all child nodes
	    def extract_text_recursively(self, item: Tag):
	        """Collect the text below *item*, skipping nested <ul>/<ol> subtrees.

	        Returns:
	            ``[item]`` when *item* is itself a string; otherwise a single
	            string made of the collected text followed by one space.
	        """
	        if isinstance(item, str):
	            return [item]

	        result = []
	        if item.name not in ("ul", "ol"):
	            try:
	                # Iterate over the children (and their text and tails)
	                for child in item:
	                    try:
	                        # Recursively get the child's text content
	                        result.extend(self.extract_text_recursively(child))
	                    except Exception:
	                        # Best-effort: skip a child that cannot be read.
	                        _log.debug("could not extract text from child: %s", child)
	            except Exception:
	                _log.warning("item has no children")

	        return "".join(result) + " "

	    def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
	        """Handles header tags (h1, h2, etc.).

	        An <h1> resets the hierarchy and is added as the document title.
	        Any other level is added as a heading below the previous level,
	        with section groups inserted for skipped levels.
	        """
	        hlevel = int(element.name.replace("h", ""))
	        text = element.text.strip()

	        if hlevel == 1:
	            for key in self.parents:
	                self.parents[key] = None

	            self.level = 1
	            self.parents[self.level] = doc.add_text(
	                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
	            )
	            return

	        if hlevel > self.level:
	            # add invisible group
	            for i in range(self.level + 1, hlevel):
	                self.parents[i] = doc.add_group(
	                    name=f"header-{i}",
	                    label=GroupLabel.SECTION,
	                    parent=self.parents[i - 1],
	                )
	            self.level = hlevel
	        elif hlevel < self.level:
	            # remove the tail
	            for key in self.parents:
	                if key > hlevel:
	                    self.parents[key] = None
	            self.level = hlevel

	        self.parents[hlevel] = doc.add_heading(
	            parent=self.parents[hlevel - 1],
	            text=text,
	            level=hlevel,
	        )

	    def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
	        """Handles monospace code snippets (pre); empty snippets are skipped."""
	        if element.text is None:
	            return
	        text = element.text.strip()
	        if len(text) == 0:
	            return
	        doc.add_code(parent=self.parents[self.level], text=text)

	    def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
	        """Handles paragraph tags (p); empty paragraphs are skipped."""
	        if element.text is None:
	            return
	        text = element.text.strip()
	        if len(text) == 0:
	            return
	        doc.add_text(
	            parent=self.parents[self.level], label=DocItemLabel.PARAGRAPH, text=text
	        )

	    def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
	        """Handles list tags (ul, ol) and their list items.

	        A list group is added one level below the current one, the list's
	        children are walked at that level, and the level is restored.
	        """
	        if element.name == "ul":
	            # create a list group
	            self.parents[self.level + 1] = doc.add_group(
	                parent=self.parents[self.level], name="list", label=GroupLabel.LIST
	            )
	        elif element.name == "ol":
	            # create a list group
	            self.parents[self.level + 1] = doc.add_group(
	                parent=self.parents[self.level],
	                name="ordered list",
	                label=GroupLabel.ORDERED_LIST,
	            )
	        self.level += 1

	        self.walk(element, doc)

	        self.parents[self.level + 1] = None
	        self.level -= 1

	    def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
	        """Handles listitem tags (li).

	        Items of an ordered list get a "<position>." marker. An item that
	        contains a nested list becomes the parent of that list's content.
	        An item with no parent at the current level is skipped.
	        """
	        parent = self.parents[self.level]
	        if parent is None:
	            _log.debug("list-item has no parent in the document: %s", element)
	            return

	        enumerated = parent.label == GroupLabel.ORDERED_LIST
	        # Same marker format for plain and nested items.
	        marker = f"{len(parent.children) + 1}." if enumerated else ""

	        if element.find(["ul", "ol"]) is not None:
	            # Text in list item can be hidden within hierarchy, hence
	            # we need to extract it recursively
	            text = self.extract_text_recursively(element)
	            # Flatten text, remove break lines:
	            text = text.replace("\n", "").replace("\r", "")
	            text = " ".join(text.split()).strip()

	            if len(text) > 0:
	                # create a list-item
	                self.parents[self.level + 1] = doc.add_list_item(
	                    text=text,
	                    enumerated=enumerated,
	                    marker=marker,
	                    parent=parent,
	                )
	                self.level += 1
	                self.walk(element, doc)
	                self.parents[self.level + 1] = None
	                self.level -= 1
	            else:
	                # No item was created, so the level must stay unchanged;
	                # the nested list attaches to the current parent.
	                self.walk(element, doc)

	        elif isinstance(element.text, str):
	            doc.add_list_item(
	                text=element.text.strip(),
	                enumerated=enumerated,
	                marker=marker,
	                parent=parent,
	            )
	        else:
	            _log.warning("list-item has no text: %s", element)

	    @staticmethod
	    def _parse_span(cell: Tag, attribute: str) -> int:
	        """Return the cell's span attribute as a positive int, defaulting to 1."""
	        try:
	            span = int(cell.get(attribute, 1))
	        except (TypeError, ValueError):
	            return 1
	        return span if span > 0 else 1

	    def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
	        """Handles table tags.

	        Every <td>/<th> becomes a TableCell placed at the first free column
	        of its row, taking colspan/rowspan into account. Tables containing
	        nested tables are skipped.
	        """
	        if element.find("table") is not None:
	            _log.warning("detected nested tables: skipping for now")
	            return

	        rows = element.find_all("tr")
	        num_rows = len(rows)
	        num_cols = 0
	        # (row, col) positions already covered by a cell, including the
	        # positions reserved by row/column spans of earlier cells.
	        occupied = set()
	        table_cells = []

	        for row_idx, row in enumerate(rows):
	            # For each row, find all the column cells (both <td> and <th>)
	            cells = row.find_all(["td", "th"])

	            # A row made only of <th> cells is a column-header row
	            col_header = all(c.name == "th" for c in cells)

	            col_idx = 0
	            for html_cell in cells:
	                try:
	                    text = self.extract_table_cell_text(html_cell)
	                except Exception as exc:
	                    # Fall back to the raw text instead of aborting.
	                    _log.warning("could not extract table-cell text: %s", exc)
	                    text = html_cell.text

	                col_span = self._parse_span(html_cell, "colspan")
	                # A row span cannot reach past the last row of the table.
	                row_span = min(
	                    self._parse_span(html_cell, "rowspan"), num_rows - row_idx
	                )

	                # Skip positions taken by spans of previous cells
	                while (row_idx, col_idx) in occupied:
	                    col_idx += 1
	                for r in range(row_span):
	                    for c in range(col_span):
	                        occupied.add((row_idx + r, col_idx + c))
	                num_cols = max(num_cols, col_idx + col_span)

	                table_cells.append(
	                    TableCell(
	                        text=text,
	                        row_span=row_span,
	                        col_span=col_span,
	                        start_row_offset_idx=row_idx,
	                        end_row_offset_idx=row_idx + row_span,
	                        start_col_offset_idx=col_idx,
	                        end_col_offset_idx=col_idx + col_span,
	                        col_header=col_header,
	                        row_header=((not col_header) and html_cell.name == "th"),
	                    )
	                )

	        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=table_cells)
	        doc.add_table(data=data, parent=self.parents[self.level])

	    def get_list_text(self, list_element: Tag, level=0):
	        """Recursively extract text from <ul> or <ol> with proper indentation.

	        Returns:
	            A list of lines, one per <li>, indented by *level* spaces and
	            prefixed with "<n>." (ordered) or "*" (unordered). Any other
	            element yields an empty list.
	        """
	        result = []
	        if list_element.name not in ("ol", "ul"):
	            return result

	        ordered = list_element.name == "ol"
	        indent = " " * level
	        for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
	            # Numbers for ordered lists, a bullet character otherwise
	            prefix = f"{i}." if ordered else "*"
	            result.append(f"{indent}{prefix} {li.get_text(strip=True)}")
	            # Handle nested lists
	            nested_list = li.find(["ul", "ol"])
	            if nested_list is not None:
	                result.extend(self.get_list_text(nested_list, level + 1))

	        return result

	    def extract_table_cell_text(self, cell: Tag):
	        """Extract text from a table cell.

	        Cells containing lists currently return their plain text as well;
	        list-aware extraction is not implemented yet.
	        """
	        if cell.find(["ul", "ol"]) is not None:
	            _log.debug(
	                "should extract the content correctly for table-cells with lists ..."
	            )
	        return cell.text

	    def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
	        """Handles figure tags, using the <figcaption> text as the caption."""
	        # Extract the image URI from the <img> tag
	        # image_uri = root.xpath('//figure//img/@src')[0]

	        figcaption = element.find(["figcaption"])
	        if figcaption is None:
	            doc.add_picture(parent=self.parents[self.level], caption=None)
	            return

	        caption_text = "".join(child.text for child in figcaption).strip()
	        fig_caption = doc.add_text(label=DocItemLabel.CAPTION, text=caption_text)
	        doc.add_picture(
	            parent=self.parents[self.level],
	            caption=fig_caption,
	        )

	    def handle_image(self, element: Tag, idx, doc: DoclingDocument):
	        """Handles image tags (img) by adding a picture without a caption."""
	        doc.add_picture(parent=self.parents[self.level], caption=None)