Spaces:
Running
Running
| import sys | |
| import subprocess | |
| from fastapi import UploadFile, File, Form | |
| from typing import Optional, Union | |
| from starlette.responses import Response | |
| from starlette.concurrency import run_in_threadpool | |
| from use_cases.pdf_analysis.analyze_pdf_use_case import AnalyzePDFUseCase | |
| from use_cases.text_extraction.extract_text_use_case import ExtractTextUseCase | |
| from use_cases.toc_extraction.extract_toc_use_case import ExtractTOCUseCase | |
| from use_cases.visualization.create_visualization_use_case import CreateVisualizationUseCase | |
| from use_cases.ocr.process_ocr_use_case import ProcessOCRUseCase | |
| from use_cases.markdown_conversion.convert_to_markdown_use_case import ConvertToMarkdownUseCase | |
| from use_cases.html_conversion.convert_to_html_use_case import ConvertToHtmlUseCase | |
| from adapters.storage.file_system_repository import FileSystemRepository | |
class FastAPIControllers:
    """Endpoint handlers that forward HTTP uploads to the PDF-processing use cases.

    The handlers use FastAPI's ``File``/``Form`` parameter defaults and hand the
    actual work to the injected use cases through ``run_in_threadpool`` so that
    it does not run on the event loop.
    """

    def __init__(
        self,
        analyze_pdf_use_case: AnalyzePDFUseCase,
        extract_text_use_case: ExtractTextUseCase,
        extract_toc_use_case: ExtractTOCUseCase,
        create_visualization_use_case: CreateVisualizationUseCase,
        process_ocr_use_case: ProcessOCRUseCase,
        convert_to_markdown_use_case: ConvertToMarkdownUseCase,
        convert_to_html_use_case: ConvertToHtmlUseCase,
        file_repository: FileSystemRepository,
    ):
        """Store the use cases and the file repository the handlers delegate to."""
        self.analyze_pdf_use_case = analyze_pdf_use_case
        self.extract_text_use_case = extract_text_use_case
        self.extract_toc_use_case = extract_toc_use_case
        self.create_visualization_use_case = create_visualization_use_case
        self.process_ocr_use_case = process_ocr_use_case
        self.convert_to_markdown_use_case = convert_to_markdown_use_case
        self.convert_to_html_use_case = convert_to_html_use_case
        self.file_repository = file_repository

    @staticmethod
    def _with_xml_suffix(file_name: str) -> str:
        """Return ``file_name`` unchanged if it ends with ".xml", else with ".xml" appended."""
        return file_name if file_name.endswith(".xml") else f"{file_name}.xml"

    @staticmethod
    def _tool_version(executable: str) -> str:
        """Return the stdout of ``<executable> --version``.

        An executable that cannot be started (missing, not executable) yields an
        empty string instead of raising, which is also what a shell invocation
        would leave on stdout in that situation.
        """
        try:
            # Argument list instead of a shell string: no shell process is spawned.
            completed = subprocess.run([executable, "--version"], text=True, capture_output=True)
        except OSError:
            return ""
        return completed.stdout

    async def root(self):
        """Return the Python version string followed by torch's CUDA availability flag."""
        # Function-scope import: torch is only loaded when this handler is first called.
        import torch

        return f"{sys.version} Using GPU: {torch.cuda.is_available()}"

    async def info(self):
        """Return runtime diagnostics.

        Returns:
            A dict with the keys ``sys`` (Python version), ``tesseract_version`` and
            ``ocrmypdf_version`` (stdout of the respective ``--version`` command) and
            ``supported_languages`` (as reported by the OCR use case).
        """
        # The version probes spawn child processes and wait for them, so they are
        # run in the thread pool rather than on the event loop.
        tesseract_version = await run_in_threadpool(self._tool_version, "tesseract")
        ocrmypdf_version = await run_in_threadpool(self._tool_version, "ocrmypdf")
        return {
            "sys": sys.version,
            "tesseract_version": tesseract_version,
            "ocrmypdf_version": ocrmypdf_version,
            "supported_languages": self.process_ocr_use_case.get_supported_languages(),
        }

    async def error(self):
        """Always raise ``FileNotFoundError``; exists to exercise error handling."""
        raise FileNotFoundError("This is a test error from the error endpoint")

    async def analyze_pdf(
        self, file: UploadFile = File(...), fast: bool = Form(False), parse_tables_and_math: bool = Form(False)
    ):
        """Read the uploaded file and run the PDF analysis use case on its bytes.

        Args:
            file: The uploaded PDF.
            fast: Forwarded to the use case.
            parse_tables_and_math: Forwarded to the use case.

        Returns:
            Whatever ``AnalyzePDFUseCase.execute`` returns.
        """
        # Awaiting UploadFile.read() keeps the (possibly disk-backed) read off the event loop.
        pdf_content = await file.read()
        # The "" and the trailing False are fixed positional arguments; their meaning is
        # defined by the use case's signature, which is not visible in this file.
        return await run_in_threadpool(
            self.analyze_pdf_use_case.execute, pdf_content, "", parse_tables_and_math, fast, False
        )

    async def analyze_and_save_xml(
        self, file: UploadFile = File(...), xml_file_name: str | None = None, fast: bool = Form(False)
    ):
        """Analyze the uploaded PDF and have the use case save the result as XML.

        Args:
            file: The uploaded PDF.
            xml_file_name: Target file name; ".xml" is appended when missing.
            fast: Forwarded to the use case.

        Returns:
            Whatever ``AnalyzePDFUseCase.execute_and_save_xml`` returns.

        Raises:
            HTTPException: With status 422 when ``xml_file_name`` is not provided.
        """
        if xml_file_name is None:
            # The signature allows None, but a name is needed to build the target file name.
            from fastapi import HTTPException

            raise HTTPException(status_code=422, detail="xml_file_name is required")

        target_name = self._with_xml_suffix(xml_file_name)
        pdf_content = await file.read()
        return await run_in_threadpool(self.analyze_pdf_use_case.execute_and_save_xml, pdf_content, target_name, fast)

    async def get_xml_by_name(self, xml_file_name: str):
        """Fetch a stored XML from the file repository, appending ".xml" to the name when missing."""
        target_name = self._with_xml_suffix(xml_file_name)
        return await run_in_threadpool(self.file_repository.get_xml, target_name)

    async def get_toc_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False)):
        """Run the TOC extraction use case on the uploaded file object."""
        return await run_in_threadpool(self.extract_toc_use_case.execute, file, fast)

    async def toc_legacy_uwazi_compatible(self, file: UploadFile = File(...)):
        """Run the TOC use case's ``execute_uwazi_compatible`` variant on the uploaded file object."""
        return await run_in_threadpool(self.extract_toc_use_case.execute_uwazi_compatible, file)

    async def get_text_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False), types: str = Form("all")):
        """Run the text extraction use case on the uploaded file object.

        ``types`` defaults to "all" and is forwarded to the use case unchanged.
        """
        return await run_in_threadpool(self.extract_text_use_case.execute, file, fast, types)

    async def get_visualization_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False)):
        """Run the visualization use case on the uploaded file object."""
        return await run_in_threadpool(self.create_visualization_use_case.execute, file, fast)

    async def ocr_pdf_sync(self, file: UploadFile = File(...), language: str = Form("en")):
        """Run the OCR use case on the uploaded file object; ``language`` defaults to "en"."""
        return await run_in_threadpool(self.process_ocr_use_case.execute, file, language)

    async def convert_to_markdown_endpoint(
        self,
        file: UploadFile = File(...),
        fast: bool = Form(False),
        extract_toc: bool = Form(False),
        dpi: int = Form(120),
        output_file: Optional[str] = Form(None),
    ) -> Union[str, Response]:
        """Read the uploaded file and run the Markdown conversion use case on its bytes.

        ``fast``, ``extract_toc``, ``dpi`` (default 120) and ``output_file`` are
        forwarded positionally, in that order, to ``ConvertToMarkdownUseCase.execute``.
        """
        pdf_content = await file.read()
        return await run_in_threadpool(
            self.convert_to_markdown_use_case.execute,
            pdf_content,
            fast,
            extract_toc,
            dpi,
            output_file,
        )

    async def convert_to_html_endpoint(
        self,
        file: UploadFile = File(...),
        fast: bool = Form(False),
        extract_toc: bool = Form(False),
        dpi: int = Form(120),
        output_file: Optional[str] = Form(None),
    ) -> Union[str, Response]:
        """Read the uploaded file and run the HTML conversion use case on its bytes.

        ``fast``, ``extract_toc``, ``dpi`` (default 120) and ``output_file`` are
        forwarded positionally, in that order, to ``ConvertToHtmlUseCase.execute``.
        """
        pdf_content = await file.read()
        return await run_in_threadpool(
            self.convert_to_html_use_case.execute,
            pdf_content,
            fast,
            extract_toc,
            dpi,
            output_file,
        )