# PDF-Data_Extractor/src/adapters/web/fastapi_controllers.py
# Wasim
# Sync: robust vehicle parser + full project
# 2e237ce
import sys
import subprocess
from fastapi import UploadFile, File, Form
from typing import Optional, Union
from starlette.responses import Response
from starlette.concurrency import run_in_threadpool
from use_cases.pdf_analysis.analyze_pdf_use_case import AnalyzePDFUseCase
from use_cases.text_extraction.extract_text_use_case import ExtractTextUseCase
from use_cases.toc_extraction.extract_toc_use_case import ExtractTOCUseCase
from use_cases.visualization.create_visualization_use_case import CreateVisualizationUseCase
from use_cases.ocr.process_ocr_use_case import ProcessOCRUseCase
from use_cases.markdown_conversion.convert_to_markdown_use_case import ConvertToMarkdownUseCase
from use_cases.html_conversion.convert_to_html_use_case import ConvertToHtmlUseCase
from adapters.storage.file_system_repository import FileSystemRepository
class FastAPIControllers:
    """Async endpoint handlers that delegate to the injected use cases.

    Every handler unpacks the request parameters and hands the actual work
    to a use case (or to the file repository) through ``run_in_threadpool``,
    so the synchronous use-case code does not run on the event loop.
    """

    def __init__(
        self,
        analyze_pdf_use_case: AnalyzePDFUseCase,
        extract_text_use_case: ExtractTextUseCase,
        extract_toc_use_case: ExtractTOCUseCase,
        create_visualization_use_case: CreateVisualizationUseCase,
        process_ocr_use_case: ProcessOCRUseCase,
        convert_to_markdown_use_case: ConvertToMarkdownUseCase,
        convert_to_html_use_case: ConvertToHtmlUseCase,
        file_repository: FileSystemRepository,
    ):
        """Store the injected use cases and the file repository."""
        self.analyze_pdf_use_case = analyze_pdf_use_case
        self.extract_text_use_case = extract_text_use_case
        self.extract_toc_use_case = extract_toc_use_case
        self.create_visualization_use_case = create_visualization_use_case
        self.process_ocr_use_case = process_ocr_use_case
        self.convert_to_markdown_use_case = convert_to_markdown_use_case
        self.convert_to_html_use_case = convert_to_html_use_case
        self.file_repository = file_repository

    @staticmethod
    def _with_xml_suffix(xml_file_name: str) -> str:
        """Return ``xml_file_name``, appending ``.xml`` unless it already ends with it."""
        if xml_file_name.endswith(".xml"):
            return xml_file_name
        return f"{xml_file_name}.xml"

    @staticmethod
    def _command_stdout(command: list[str]) -> str:
        """Run ``command`` without a shell and return its captured stdout.

        A non-zero exit status is not treated as an error. If the executable
        cannot be started (e.g. it is not installed) an empty string is
        returned, which is what the former shell-based call produced.
        """
        try:
            completed = subprocess.run(command, text=True, capture_output=True, check=False)
        except OSError:
            return ""
        return completed.stdout

    async def root(self):
        """Return the Python version string plus whether torch reports CUDA as available."""
        # Imported lazily so torch is only loaded when this endpoint is hit.
        import torch

        return sys.version + " Using GPU: " + str(torch.cuda.is_available())

    async def info(self):
        """Return interpreter, tesseract and ocrmypdf versions and the supported OCR languages."""
        # Spawning processes blocks, so the version probes run in the threadpool
        # instead of on the event loop.
        tesseract_version = await run_in_threadpool(self._command_stdout, ["tesseract", "--version"])
        ocrmypdf_version = await run_in_threadpool(self._command_stdout, ["ocrmypdf", "--version"])
        return {
            "sys": sys.version,
            "tesseract_version": tesseract_version,
            "ocrmypdf_version": ocrmypdf_version,
            "supported_languages": self.process_ocr_use_case.get_supported_languages(),
        }

    async def error(self):
        """Always raise ``FileNotFoundError``; a test endpoint for error handling."""
        raise FileNotFoundError("This is a test error from the error endpoint")

    async def analyze_pdf(
        self, file: UploadFile = File(...), fast: bool = Form(False), parse_tables_and_math: bool = Form(False)
    ):
        """Run the PDF analysis use case on the uploaded file's bytes.

        The empty string and the trailing ``False`` are passed positionally;
        their meaning is defined by ``AnalyzePDFUseCase.execute`` and is not
        visible from this module.
        """
        return await run_in_threadpool(
            self.analyze_pdf_use_case.execute, file.file.read(), "", parse_tables_and_math, fast, False
        )

    async def analyze_and_save_xml(
        self, file: UploadFile = File(...), xml_file_name: str | None = None, fast: bool = Form(False)
    ):
        """Analyze the uploaded PDF via ``execute_and_save_xml`` under an ``.xml`` name.

        Raises:
            HTTPException: 400 when ``xml_file_name`` is not provided.
        """
        if xml_file_name is None:
            # The parameter defaults to None; fail with a clear client error
            # instead of an AttributeError on ``None.endswith``.
            from fastapi import HTTPException

            raise HTTPException(status_code=400, detail="xml_file_name is required")
        xml_file_name = self._with_xml_suffix(xml_file_name)
        return await run_in_threadpool(
            self.analyze_pdf_use_case.execute_and_save_xml, file.file.read(), xml_file_name, fast
        )

    async def get_xml_by_name(self, xml_file_name: str):
        """Fetch a stored XML from the file repository, adding ``.xml`` to the name if missing."""
        xml_file_name = self._with_xml_suffix(xml_file_name)
        return await run_in_threadpool(self.file_repository.get_xml, xml_file_name)

    async def get_toc_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False)):
        """Pass the upload object and ``fast`` flag to the TOC extraction use case."""
        return await run_in_threadpool(self.extract_toc_use_case.execute, file, fast)

    async def toc_legacy_uwazi_compatible(self, file: UploadFile = File(...)):
        """Pass the upload object to the use case's ``execute_uwazi_compatible`` entry point."""
        return await run_in_threadpool(self.extract_toc_use_case.execute_uwazi_compatible, file)

    async def get_text_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False), types: str = Form("all")):
        """Pass the upload object, ``fast`` flag and ``types`` string to the text extraction use case."""
        return await run_in_threadpool(self.extract_text_use_case.execute, file, fast, types)

    async def get_visualization_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False)):
        """Pass the upload object and ``fast`` flag to the visualization use case."""
        return await run_in_threadpool(self.create_visualization_use_case.execute, file, fast)

    async def ocr_pdf_sync(self, file: UploadFile = File(...), language: str = Form("en")):
        """Pass the upload object and ``language`` (default ``"en"``) to the OCR use case."""
        return await run_in_threadpool(self.process_ocr_use_case.execute, file, language)

    async def convert_to_markdown_endpoint(
        self,
        file: UploadFile = File(...),
        fast: bool = Form(False),
        extract_toc: bool = Form(False),
        dpi: int = Form(120),
        output_file: Optional[str] = Form(None),
    ) -> Union[str, Response]:
        """Run the markdown conversion use case on the uploaded file's bytes.

        ``fast``, ``extract_toc``, ``dpi`` and ``output_file`` are forwarded
        unchanged; how they affect the result is decided by the use case.
        """
        return await run_in_threadpool(
            self.convert_to_markdown_use_case.execute,
            file.file.read(),
            fast,
            extract_toc,
            dpi,
            output_file,
        )

    async def convert_to_html_endpoint(
        self,
        file: UploadFile = File(...),
        fast: bool = Form(False),
        extract_toc: bool = Form(False),
        dpi: int = Form(120),
        output_file: Optional[str] = Form(None),
    ) -> Union[str, Response]:
        """Run the HTML conversion use case on the uploaded file's bytes.

        ``fast``, ``extract_toc``, ``dpi`` and ``output_file`` are forwarded
        unchanged; how they affect the result is decided by the use case.
        """
        return await run_in_threadpool(
            self.convert_to_html_use_case.execute,
            file.file.read(),
            fast,
            extract_toc,
            dpi,
            output_file,
        )