Spaces:

Shami96
/

PDF-Data_Extractor

Running

File size: 5,346 Bytes

2e237ce

import sys
import subprocess
from fastapi import UploadFile, File, Form
from typing import Optional, Union
from starlette.responses import Response
from starlette.concurrency import run_in_threadpool
from use_cases.pdf_analysis.analyze_pdf_use_case import AnalyzePDFUseCase
from use_cases.text_extraction.extract_text_use_case import ExtractTextUseCase
from use_cases.toc_extraction.extract_toc_use_case import ExtractTOCUseCase
from use_cases.visualization.create_visualization_use_case import CreateVisualizationUseCase
from use_cases.ocr.process_ocr_use_case import ProcessOCRUseCase
from use_cases.markdown_conversion.convert_to_markdown_use_case import ConvertToMarkdownUseCase
from use_cases.html_conversion.convert_to_html_use_case import ConvertToHtmlUseCase
from adapters.storage.file_system_repository import FileSystemRepository


class FastAPIControllers:
    """HTTP endpoint handlers that forward requests to the injected use cases.

    Every handler is a coroutine. Work delegated to a use case or to the file
    repository is run through ``run_in_threadpool`` rather than being called
    directly on the event loop.
    """

    def __init__(
        self,
        analyze_pdf_use_case: AnalyzePDFUseCase,
        extract_text_use_case: ExtractTextUseCase,
        extract_toc_use_case: ExtractTOCUseCase,
        create_visualization_use_case: CreateVisualizationUseCase,
        process_ocr_use_case: ProcessOCRUseCase,
        convert_to_markdown_use_case: ConvertToMarkdownUseCase,
        convert_to_html_use_case: ConvertToHtmlUseCase,
        file_repository: FileSystemRepository,
    ):
        """Store the use cases and the file repository the handlers delegate to."""
        self.analyze_pdf_use_case = analyze_pdf_use_case
        self.extract_text_use_case = extract_text_use_case
        self.extract_toc_use_case = extract_toc_use_case
        self.create_visualization_use_case = create_visualization_use_case
        self.process_ocr_use_case = process_ocr_use_case
        self.convert_to_markdown_use_case = convert_to_markdown_use_case
        self.convert_to_html_use_case = convert_to_html_use_case
        self.file_repository = file_repository

    @staticmethod
    def _ensure_xml_suffix(xml_file_name: str) -> str:
        """Return ``xml_file_name`` unchanged if it ends with ".xml", else with ".xml" appended."""
        if xml_file_name.endswith(".xml"):
            return xml_file_name
        return f"{xml_file_name}.xml"

    @staticmethod
    def _command_stdout(command: list[str]) -> str:
        """Run ``command`` (an argv list, no shell) and return its captured stdout.

        Returns an empty string when the executable cannot be started, which is
        what the previous shell-based invocation yielded for a missing binary.
        """
        try:
            return subprocess.run(command, text=True, capture_output=True, check=False).stdout
        except OSError:
            return ""

    async def root(self):
        """Return the Python version string followed by whether torch reports CUDA as available."""
        # Imported on demand so torch is only loaded when this endpoint is hit.
        import torch

        return sys.version + " Using GPU: " + str(torch.cuda.is_available())

    async def info(self):
        """Return the Python version, the tesseract/ocrmypdf version output and the OCR languages.

        The external commands are executed in the thread pool so the event loop
        is not blocked while they run.
        """
        tesseract_version = await run_in_threadpool(self._command_stdout, ["tesseract", "--version"])
        ocrmypdf_version = await run_in_threadpool(self._command_stdout, ["ocrmypdf", "--version"])
        return {
            "sys": sys.version,
            "tesseract_version": tesseract_version,
            "ocrmypdf_version": ocrmypdf_version,
            "supported_languages": self.process_ocr_use_case.get_supported_languages(),
        }

    async def error(self):
        """Always raise ``FileNotFoundError`` with a fixed test message."""
        raise FileNotFoundError("This is a test error from the error endpoint")

    async def analyze_pdf(
        self, file: UploadFile = File(...), fast: bool = Form(False), parse_tables_and_math: bool = Form(False)
    ):
        """Read the uploaded file and pass its bytes to ``analyze_pdf_use_case.execute``.

        The use case is called with an empty string as second argument and
        ``False`` as last argument, followed the same way as before by
        ``parse_tables_and_math`` and ``fast`` in between.
        """
        pdf_content = await file.read()
        return await run_in_threadpool(
            self.analyze_pdf_use_case.execute, pdf_content, "", parse_tables_and_math, fast, False
        )

    async def analyze_and_save_xml(
        self, file: UploadFile = File(...), xml_file_name: str | None = None, fast: bool = Form(False)
    ):
        """Read the uploaded file and pass it to ``analyze_pdf_use_case.execute_and_save_xml``.

        ``xml_file_name`` gets a ".xml" suffix appended when it does not already
        end with one.

        Raises:
            HTTPException: with status 422 when ``xml_file_name`` is not provided.
        """
        if xml_file_name is None:
            # The signature allows None; fail with a clear client error instead of
            # an AttributeError from calling .endswith on None.
            from fastapi import HTTPException

            raise HTTPException(status_code=422, detail="xml_file_name is required")
        xml_file_name = self._ensure_xml_suffix(xml_file_name)
        pdf_content = await file.read()
        return await run_in_threadpool(self.analyze_pdf_use_case.execute_and_save_xml, pdf_content, xml_file_name, fast)

    async def get_xml_by_name(self, xml_file_name: str):
        """Return ``file_repository.get_xml`` for the given name, appending ".xml" if missing."""
        # NOTE(review): xml_file_name is caller-supplied; confirm the repository
        # restricts lookups to its own storage directory.
        xml_file_name = self._ensure_xml_suffix(xml_file_name)
        return await run_in_threadpool(self.file_repository.get_xml, xml_file_name)

    async def get_toc_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False)):
        """Run ``extract_toc_use_case.execute`` on the upload in the thread pool."""
        return await run_in_threadpool(self.extract_toc_use_case.execute, file, fast)

    async def toc_legacy_uwazi_compatible(self, file: UploadFile = File(...)):
        """Run ``extract_toc_use_case.execute_uwazi_compatible`` on the upload in the thread pool."""
        return await run_in_threadpool(self.extract_toc_use_case.execute_uwazi_compatible, file)

    async def get_text_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False), types: str = Form("all")):
        """Run ``extract_text_use_case.execute`` with the upload, ``fast`` and ``types``."""
        return await run_in_threadpool(self.extract_text_use_case.execute, file, fast, types)

    async def get_visualization_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False)):
        """Run ``create_visualization_use_case.execute`` on the upload in the thread pool."""
        return await run_in_threadpool(self.create_visualization_use_case.execute, file, fast)

    async def ocr_pdf_sync(self, file: UploadFile = File(...), language: str = Form("en")):
        """Run ``process_ocr_use_case.execute`` with the upload and the requested language."""
        return await run_in_threadpool(self.process_ocr_use_case.execute, file, language)

    async def convert_to_markdown_endpoint(
        self,
        file: UploadFile = File(...),
        fast: bool = Form(False),
        extract_toc: bool = Form(False),
        dpi: int = Form(120),
        output_file: Optional[str] = Form(None),
    ) -> Union[str, Response]:
        """Read the upload and pass its bytes and the form options to ``convert_to_markdown_use_case.execute``."""
        pdf_content = await file.read()
        return await run_in_threadpool(
            self.convert_to_markdown_use_case.execute,
            pdf_content,
            fast,
            extract_toc,
            dpi,
            output_file,
        )

    async def convert_to_html_endpoint(
        self,
        file: UploadFile = File(...),
        fast: bool = Form(False),
        extract_toc: bool = Form(False),
        dpi: int = Form(120),
        output_file: Optional[str] = Form(None),
    ) -> Union[str, Response]:
        """Read the upload and pass its bytes and the form options to ``convert_to_html_use_case.execute``."""
        pdf_content = await file.read()
        return await run_in_threadpool(
            self.convert_to_html_use_case.execute,
            pdf_content,
            fast,
            extract_toc,
            dpi,
            output_file,
        )