Spaces:
Runtime error
Runtime error
import io

import aiohttp
from markitdown import MarkItDown
class PDFFetcher:
    """Convert PDF documents, local or remote, to Markdown using MarkItDown."""

    def __init__(self):
        # One converter instance is reused for every conversion.
        self.md = MarkItDown(enable_plugins=True)

    def read_local(self, pdf_path: str) -> str:
        """Convert the file at ``pdf_path`` and return post-processed Markdown.

        Args:
            pdf_path: Path handed straight to ``MarkItDown.convert``.

        Returns:
            The converted text after :meth:`postprocess`.
        """
        result = self.md.convert(pdf_path)
        return self.postprocess(result.text_content)

    def postprocess(self, markdown: str) -> str:
        """Replace form feeds with newlines and strip surrounding whitespace.

        Args:
            markdown: Raw text produced by the converter.

        Returns:
            ``markdown`` with every ``"\\f"`` turned into ``"\\n"`` and
            leading/trailing whitespace removed.
        """
        # NOTE(review): the form feeds are presumably page separators emitted
        # by the PDF converter -- confirm against MarkItDown's output.
        return markdown.replace("\f", "\n").strip()

    async def fetch(self, pdf_url: str) -> str:
        """Download ``pdf_url`` and return its content as Markdown.

        Args:
            pdf_url: URL requested with an HTTP GET.

        Returns:
            The converted text after :meth:`postprocess`.

        Raises:
            Exception: If the response status is anything other than 200.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(pdf_url) as res:
                if res.status != 200:
                    raise Exception(f"Failed to download PDF: {res.status}")
                pdf_content = await res.read()

        # The body is fully buffered at this point, so the conversion runs
        # after the response and session have been released.
        result = self.md.convert_stream(io.BytesIO(pdf_content))
        return self.postprocess(result.text_content)
class HTMLFetcher:
    """Download a web page and convert it to Markdown using MarkItDown."""

    def __init__(self):
        # One converter instance is reused for every conversion.
        self.md = MarkItDown(enable_plugins=True)

    async def fetch(self, html_url: str) -> str:
        """Download ``html_url`` and return its content as Markdown.

        Args:
            html_url: URL requested with an HTTP GET.

        Returns:
            The ``text_content`` of the MarkItDown conversion result.

        Raises:
            Exception: If the response status is anything other than 200.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(html_url) as res:
                if res.status != 200:
                    raise Exception(f"Failed to download HTML: {res.status}")
                data = await res.read()

        # The body is fully buffered at this point, so the conversion runs
        # after the response and session have been released.
        result = self.md.convert_stream(io.BytesIO(data))
        return result.text_content
class AutoFetcher:
    """Download any URL and convert it to Markdown.

    The response's ``Content-Type`` header decides whether the PDF
    post-processing step is applied to the converted text.
    """

    def __init__(self):
        self.pdf_fetcher = PDFFetcher()
        self.html_fetcher = HTMLFetcher()
        self.md = MarkItDown(enable_plugins=True)

    async def fetch(self, url: str) -> str:
        """Download ``url`` and return its content as Markdown.

        Args:
            url: URL requested with an HTTP GET.

        Returns:
            The converted text; PDF responses are additionally passed through
            ``self.pdf_fetcher.postprocess``.

        Raises:
            Exception: If the response status is anything other than 200.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as res:
                if res.status != 200:
                    raise Exception(f"Failed to download URL: {res.status}")
                data = await res.read()
                # aiohttp response headers are case-insensitive, so a single
                # lookup covers every spelling of the header name.
                content_type = res.headers.get("Content-Type", "text/plain")

        return self._to_markdown(data, content_type)

    def _to_markdown(self, data: bytes, content_type: str) -> str:
        """Convert downloaded bytes, applying PDF clean-up when appropriate."""
        text = self.md.convert_stream(io.BytesIO(data)).text_content
        # Media types are case-insensitive and may carry parameters
        # (e.g. "; charset=..."), hence the lowered substring match.
        if "application/pdf" in content_type.lower():
            return self.pdf_fetcher.postprocess(text)
        # HTML, plain text and anything else are returned as converted.
        return text