Spaces:
Sleeping
Sleeping
| from abc import ABC, abstractmethod | |
| from typing import Optional, Dict, Any, Tuple | |
| from .models import MarkdownGenerationResult | |
| from .html2text import CustomHTML2Text | |
| from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter | |
| import re | |
| from urllib.parse import urljoin | |
# Pre-compiled pattern for inline markdown links and images, compiled once at
# import time so it is not rebuilt on every conversion.
#   - optional leading "!" marks an image
#   - group 1: the (non-empty) text between the square brackets
#   - group 2: the link target inside the parentheses
#   - group 3: the optional double-quoted title following the target
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
def fast_urljoin(base: str, url: str) -> str:
    """Resolve ``url`` against ``base``, short-circuiting the common cases.

    Args:
        base: URL of the document the link was found in.
        url: Link target as written in the document.

    Returns:
        ``url`` unchanged when it already starts with ``http://``,
        ``https://``, ``mailto:`` or ``//``; otherwise ``url`` resolved
        against ``base``.
    """
    # Already absolute (or protocol-relative / mailto): nothing to resolve.
    if url.startswith(('http://', 'https://', 'mailto:', '//')):
        return url

    if url.startswith('/'):
        if '://' in base:
            # A root-relative path replaces the base's entire path, so it has
            # to be resolved against the base's origin. Appending it to the
            # full base would turn "https://host/docs/page.html" + "/about"
            # into "https://host/docs/page.html/about".
            return urljoin(base, url)
        # Scheme-less base (e.g. "example.com/"): urljoin cannot recover an
        # origin from it, so fall back to plain concatenation while avoiding
        # a doubled slash.
        if base.endswith('/'):
            return base[:-1] + url
        return base + url

    # Everything else (relative paths, fragments, other schemes).
    return urljoin(base, url)
class MarkdownGenerationStrategy(ABC):
    """Abstract base class for markdown generation strategies.

    Concrete strategies must implement ``generate_markdown``; the base class
    only stores the configuration shared by all strategies.

    Args:
        content_filter (Optional[RelevantContentFilter]): Filter stored on the
            instance as ``self.content_filter``.
        options (Optional[Dict[str, Any]]): Strategy options stored on the
            instance as ``self.options``; ``None`` becomes an empty dict.
    """

    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        self.content_filter = content_filter
        # Normalise to a dict so subclasses can read options without a None check.
        self.options = options or {}

    # Marked abstract so that a subclass which forgets to override this method
    # fails at instantiation time instead of silently returning None.
    @abstractmethod
    def generate_markdown(self,
                          cleaned_html: str,
                          base_url: str = "",
                          html2text_options: Optional[Dict[str, Any]] = None,
                          content_filter: Optional[RelevantContentFilter] = None,
                          citations: bool = True,
                          **kwargs) -> MarkdownGenerationResult:
        """Generate markdown from cleaned HTML.

        Args:
            cleaned_html (str): Cleaned HTML content.
            base_url (str): Base URL for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
            content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
            citations (bool): Whether to generate citations.
            **kwargs: Extra, implementation-specific arguments.

        Returns:
            MarkdownGenerationResult: The generated markdown variants.
        """
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """
    Default implementation of markdown generation strategy.

    How it works:
    1. Generate raw markdown from cleaned HTML.
    2. Convert links to citations.
    3. Generate fit markdown if content filter is provided.
    4. Return MarkdownGenerationResult.

    Args:
        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.

    Returns:
        MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
    """

    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        super().__init__(content_filter, options)

    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
        """
        Convert links in markdown to citations.

        How it works:
        1. Find all inline links and images in the markdown with ``LINK_PATTERN``.
        2. Replace each one with its text followed by a numbered marker ``⟨n⟩``
           (images become ``![text⟨n⟩]``). The same resolved URL always gets
           the same number.
        3. Return converted markdown and references markdown.

        Note:
            Link targets wrapped in angle brackets (``[text](<url>)``, the form
            produced when html2text's ``protect_links`` option is on) are
            unwrapped before being resolved, so the brackets never end up in
            the references or get treated as part of a relative path.

            The references block always starts with a ``## References``
            heading, even when the markdown contains no links.

        Args:
            markdown (str): Markdown text.
            base_url (str): Base URL for URL joins. When empty, link targets
                are left as written.

        Returns:
            Tuple[str, str]: Converted markdown and references markdown.
        """
        link_map = {}   # resolved URL -> (citation number, description suffix)
        url_cache = {}  # raw link target -> resolved URL, avoids repeated joins
        parts = []
        last_end = 0
        counter = 1

        for match in LINK_PATTERN.finditer(markdown):
            # Keep the text between the previous link and this one verbatim.
            parts.append(markdown[last_end:match.start()])
            text, url, title = match.groups()

            # Unwrap "<url>" so the brackets are not resolved as a relative
            # path against base_url or emitted in the references.
            if len(url) > 2 and url.startswith('<') and url.endswith('>'):
                url = url[1:-1]

            # Use cached URL if available, otherwise compute and cache
            if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
                if url not in url_cache:
                    url_cache[url] = fast_urljoin(base_url, url)
                url = url_cache[url]

            if url not in link_map:
                # The description comes from the first occurrence of the URL:
                # title first, then the link text unless it repeats the title.
                desc = []
                if title:
                    desc.append(title)
                if text and text != title:
                    desc.append(text)
                link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
                counter += 1

            num = link_map[url][0]
            if match.group(0).startswith('!'):
                parts.append(f"![{text}⟨{num}⟩]")
            else:
                parts.append(f"{text}⟨{num}⟩")
            last_end = match.end()

        # Trailing text after the last link.
        parts.append(markdown[last_end:])
        converted_text = ''.join(parts)

        # Pre-build reference strings, ordered by citation number.
        references = ["\n\n## References\n\n"]
        references.extend(
            f"⟨{num}⟩ {url}{desc}\n"
            for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0])
        )

        return converted_text, ''.join(references)

    def generate_markdown(self,
                          cleaned_html: str,
                          base_url: str = "",
                          html2text_options: Optional[Dict[str, Any]] = None,
                          options: Optional[Dict[str, Any]] = None,
                          content_filter: Optional[RelevantContentFilter] = None,
                          citations: bool = True,
                          **kwargs) -> MarkdownGenerationResult:
        """
        Generate markdown with citations from cleaned HTML.

        How it works:
        1. Generate raw markdown from cleaned HTML.
        2. Convert links to citations.
        3. Generate fit markdown if content filter is provided.
        4. Return MarkdownGenerationResult.

        Each stage is best-effort: a failure in one stage is reported as an
        error message in the corresponding result field instead of being
        raised, so this method does not propagate exceptions.

        Args:
            cleaned_html (str): Cleaned HTML content. Falsy values are treated
                as an empty string; other non-string values are passed through ``str()``.
            base_url (str): Base URL for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
                Takes precedence over ``options`` and the instance options.
            options (Optional[Dict[str, Any]]): Additional options for markdown
                generation. Used only when ``html2text_options`` is not given.
            content_filter (Optional[RelevantContentFilter]): Content filter for
                generating fit markdown. Falls back to the instance's filter.
            citations (bool): Whether to generate citations.

        Returns:
            MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
        """
        try:
            # Initialize HTML2Text with default options for better conversion
            h = CustomHTML2Text(baseurl=base_url)
            default_options = {
                'body_width': 0,  # Disable text wrapping
                'ignore_emphasis': False,
                'ignore_links': False,
                'ignore_images': False,
                'protect_links': True,
                'single_line_break': True,
                'mark_code': True,
                'escape_snob': False
            }

            # Update with custom options if provided. Only the first non-empty
            # source is applied: call-level html2text_options, then call-level
            # options, then the options given at construction time.
            if html2text_options:
                default_options.update(html2text_options)
            elif options:
                default_options.update(options)
            elif self.options:
                default_options.update(self.options)

            h.update_params(**default_options)

            # Ensure we have valid input
            if not cleaned_html:
                cleaned_html = ""
            elif not isinstance(cleaned_html, str):
                cleaned_html = str(cleaned_html)

            # Generate raw markdown
            try:
                raw_markdown = h.handle(cleaned_html)
            except Exception as e:
                raw_markdown = f"Error converting HTML to markdown: {str(e)}"

            raw_markdown = raw_markdown.replace(' ```', '```')

            # Convert links to citations
            markdown_with_citations: str = raw_markdown
            references_markdown: str = ""
            if citations:
                try:
                    markdown_with_citations, references_markdown = self.convert_links_to_citations(
                        raw_markdown, base_url
                    )
                except Exception as e:
                    markdown_with_citations = raw_markdown
                    references_markdown = f"Error generating citations: {str(e)}"

            # Generate fit markdown if content filter is provided
            fit_markdown: Optional[str] = ""
            filtered_html: Optional[str] = ""
            if content_filter or self.content_filter:
                try:
                    content_filter = content_filter or self.content_filter
                    # NOTE(review): filter_content is assumed to return an
                    # iterable of HTML fragment strings — confirm against
                    # RelevantContentFilter. Each fragment is wrapped in its
                    # own <div> before conversion.
                    filtered_html = content_filter.filter_content(cleaned_html)
                    filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
                    fit_markdown = h.handle(filtered_html)
                except Exception as e:
                    fit_markdown = f"Error generating fit markdown: {str(e)}"
                    filtered_html = ""

            return MarkdownGenerationResult(
                raw_markdown=raw_markdown or "",
                markdown_with_citations=markdown_with_citations or "",
                references_markdown=references_markdown or "",
                fit_markdown=fit_markdown or "",
                fit_html=filtered_html or "",
            )
        except Exception as e:
            # If anything fails, return the error message in the markdown
            # fields and empty strings everywhere else.
            error_msg = f"Error in markdown generation: {str(e)}"
            return MarkdownGenerationResult(
                raw_markdown=error_msg,
                markdown_with_citations=error_msg,
                references_markdown="",
                fit_markdown="",
                fit_html="",
            )