File size: 5,346 Bytes
2e237ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import subprocess
import sys
from typing import Optional, Union

from fastapi import File, Form, HTTPException, UploadFile
from starlette.concurrency import run_in_threadpool
from starlette.responses import Response

from adapters.storage.file_system_repository import FileSystemRepository
from use_cases.html_conversion.convert_to_html_use_case import ConvertToHtmlUseCase
from use_cases.markdown_conversion.convert_to_markdown_use_case import ConvertToMarkdownUseCase
from use_cases.ocr.process_ocr_use_case import ProcessOCRUseCase
from use_cases.pdf_analysis.analyze_pdf_use_case import AnalyzePDFUseCase
from use_cases.text_extraction.extract_text_use_case import ExtractTextUseCase
from use_cases.toc_extraction.extract_toc_use_case import ExtractTOCUseCase
from use_cases.visualization.create_visualization_use_case import CreateVisualizationUseCase


class FastAPIControllers:
    """Endpoint handlers that forward FastAPI request inputs to the injected use cases.

    Each handler is a coroutine. Blocking work (use-case execution, external
    commands, repository reads) is dispatched through ``run_in_threadpool`` so
    the event loop is not stalled while it runs.
    """

    def __init__(
        self,
        analyze_pdf_use_case: AnalyzePDFUseCase,
        extract_text_use_case: ExtractTextUseCase,
        extract_toc_use_case: ExtractTOCUseCase,
        create_visualization_use_case: CreateVisualizationUseCase,
        process_ocr_use_case: ProcessOCRUseCase,
        convert_to_markdown_use_case: ConvertToMarkdownUseCase,
        convert_to_html_use_case: ConvertToHtmlUseCase,
        file_repository: FileSystemRepository,
    ):
        """Store the use cases and the file repository the handlers delegate to."""
        self.analyze_pdf_use_case = analyze_pdf_use_case
        self.extract_text_use_case = extract_text_use_case
        self.extract_toc_use_case = extract_toc_use_case
        self.create_visualization_use_case = create_visualization_use_case
        self.process_ocr_use_case = process_ocr_use_case
        self.convert_to_markdown_use_case = convert_to_markdown_use_case
        self.convert_to_html_use_case = convert_to_html_use_case
        self.file_repository = file_repository

    @staticmethod
    def _with_xml_suffix(xml_file_name: str) -> str:
        """Return ``xml_file_name`` unchanged if it ends with ``.xml``, else with ``.xml`` appended."""
        if xml_file_name.endswith(".xml"):
            return xml_file_name
        return f"{xml_file_name}.xml"

    @staticmethod
    def _command_stdout(*args: str) -> str:
        """Run an external command and return its captured standard output.

        The command is executed directly from an argument list (no shell).
        If the executable cannot be started, an empty string is returned,
        which is the same stdout a shell invocation of a missing command
        would have produced.
        """
        try:
            completed = subprocess.run(list(args), text=True, capture_output=True, check=False)
        except OSError:
            return ""
        return completed.stdout

    async def root(self):
        """Return the Python version string followed by whether CUDA is available to torch."""
        # Imported lazily so that importing this module does not load torch.
        import torch

        return f"{sys.version} Using GPU: {torch.cuda.is_available()}"

    async def info(self):
        """Return interpreter, tesseract and ocrmypdf version output plus the supported OCR languages."""
        # The version probes spawn processes; run them off the event loop like
        # every other blocking call in this class.
        tesseract_version = await run_in_threadpool(self._command_stdout, "tesseract", "--version")
        ocrmypdf_version = await run_in_threadpool(self._command_stdout, "ocrmypdf", "--version")
        return {
            "sys": sys.version,
            "tesseract_version": tesseract_version,
            "ocrmypdf_version": ocrmypdf_version,
            "supported_languages": self.process_ocr_use_case.get_supported_languages(),
        }

    async def error(self):
        """Always raise ``FileNotFoundError``; exists to exercise error handling."""
        raise FileNotFoundError("This is a test error from the error endpoint")

    async def analyze_pdf(
        self, file: UploadFile = File(...), fast: bool = Form(False), parse_tables_and_math: bool = Form(False)
    ):
        """Read the uploaded file and run ``analyze_pdf_use_case.execute`` on its bytes in a worker thread."""
        pdf_content = await file.read()
        # NOTE(review): the "" and False positional arguments are passed through
        # as in the original call; their meaning is defined by the use case.
        return await run_in_threadpool(
            self.analyze_pdf_use_case.execute, pdf_content, "", parse_tables_and_math, fast, False
        )

    async def analyze_and_save_xml(
        self, file: UploadFile = File(...), xml_file_name: str | None = None, fast: bool = Form(False)
    ):
        """Run ``analyze_pdf_use_case.execute_and_save_xml`` on the upload under an ``.xml`` file name.

        Raises:
            HTTPException: with status 422 when ``xml_file_name`` is not provided.
        """
        # The parameter defaults to None; reject that explicitly instead of
        # failing with AttributeError on the suffix check.
        if xml_file_name is None:
            raise HTTPException(status_code=422, detail="xml_file_name is required")
        target_name = self._with_xml_suffix(xml_file_name)
        pdf_content = await file.read()
        return await run_in_threadpool(self.analyze_pdf_use_case.execute_and_save_xml, pdf_content, target_name, fast)

    async def get_xml_by_name(self, xml_file_name: str):
        """Fetch an XML from the file repository, appending ``.xml`` to the name when missing."""
        return await run_in_threadpool(self.file_repository.get_xml, self._with_xml_suffix(xml_file_name))

    async def get_toc_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False)):
        """Run ``extract_toc_use_case.execute`` on the uploaded file in a worker thread."""
        return await run_in_threadpool(self.extract_toc_use_case.execute, file, fast)

    async def toc_legacy_uwazi_compatible(self, file: UploadFile = File(...)):
        """Run ``extract_toc_use_case.execute_uwazi_compatible`` on the uploaded file in a worker thread."""
        return await run_in_threadpool(self.extract_toc_use_case.execute_uwazi_compatible, file)

    async def get_text_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False), types: str = Form("all")):
        """Run ``extract_text_use_case.execute`` with the upload, ``fast`` and ``types`` in a worker thread."""
        return await run_in_threadpool(self.extract_text_use_case.execute, file, fast, types)

    async def get_visualization_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False)):
        """Run ``create_visualization_use_case.execute`` on the uploaded file in a worker thread."""
        return await run_in_threadpool(self.create_visualization_use_case.execute, file, fast)

    async def ocr_pdf_sync(self, file: UploadFile = File(...), language: str = Form("en")):
        """Run ``process_ocr_use_case.execute`` on the uploaded file for ``language`` in a worker thread."""
        return await run_in_threadpool(self.process_ocr_use_case.execute, file, language)

    async def convert_to_markdown_endpoint(
        self,
        file: UploadFile = File(...),
        fast: bool = Form(False),
        extract_toc: bool = Form(False),
        dpi: int = Form(120),
        output_file: Optional[str] = Form(None),
    ) -> Union[str, Response]:
        """Read the upload and run ``convert_to_markdown_use_case.execute`` on its bytes in a worker thread."""
        pdf_content = await file.read()
        return await run_in_threadpool(
            self.convert_to_markdown_use_case.execute,
            pdf_content,
            fast,
            extract_toc,
            dpi,
            output_file,
        )

    async def convert_to_html_endpoint(
        self,
        file: UploadFile = File(...),
        fast: bool = Form(False),
        extract_toc: bool = Form(False),
        dpi: int = Form(120),
        output_file: Optional[str] = Form(None),
    ) -> Union[str, Response]:
        """Read the upload and run ``convert_to_html_use_case.execute`` on its bytes in a worker thread."""
        pdf_content = await file.read()
        return await run_in_threadpool(
            self.convert_to_html_use_case.execute,
            pdf_content,
            fast,
            extract_toc,
            dpi,
            output_file,
        )