Spaces:

Shami96
/

PDF-Data_Extractor

Running

PDF-Data_Extractor / src /adapters /web /fastapi_controllers.py

Wasim

Sync: robust vehicle parser + full project

2e237ce 3 months ago

5.35 kB

	import sys
	import subprocess
	from fastapi import UploadFile, File, Form
	from typing import Optional, Union
	from starlette.responses import Response
	from starlette.concurrency import run_in_threadpool
	from use_cases.pdf_analysis.analyze_pdf_use_case import AnalyzePDFUseCase
	from use_cases.text_extraction.extract_text_use_case import ExtractTextUseCase
	from use_cases.toc_extraction.extract_toc_use_case import ExtractTOCUseCase
	from use_cases.visualization.create_visualization_use_case import CreateVisualizationUseCase
	from use_cases.ocr.process_ocr_use_case import ProcessOCRUseCase
	from use_cases.markdown_conversion.convert_to_markdown_use_case import ConvertToMarkdownUseCase
	from use_cases.html_conversion.convert_to_html_use_case import ConvertToHtmlUseCase
	from adapters.storage.file_system_repository import FileSystemRepository


	class FastAPIControllers:
	    """HTTP endpoint handlers that delegate to the injected use cases.

	    Every public coroutine is a thin adapter: it reads the request inputs and
	    forwards them to one use case (or to the file repository) through
	    ``run_in_threadpool`` so the synchronous work runs off the event loop.
	    """

	    def __init__(
	        self,
	        analyze_pdf_use_case: AnalyzePDFUseCase,
	        extract_text_use_case: ExtractTextUseCase,
	        extract_toc_use_case: ExtractTOCUseCase,
	        create_visualization_use_case: CreateVisualizationUseCase,
	        process_ocr_use_case: ProcessOCRUseCase,
	        convert_to_markdown_use_case: ConvertToMarkdownUseCase,
	        convert_to_html_use_case: ConvertToHtmlUseCase,
	        file_repository: FileSystemRepository,
	    ):
	        """Store the use cases and the file repository the endpoints delegate to."""
	        self.analyze_pdf_use_case = analyze_pdf_use_case
	        self.extract_text_use_case = extract_text_use_case
	        self.extract_toc_use_case = extract_toc_use_case
	        self.create_visualization_use_case = create_visualization_use_case
	        self.process_ocr_use_case = process_ocr_use_case
	        self.convert_to_markdown_use_case = convert_to_markdown_use_case
	        self.convert_to_html_use_case = convert_to_html_use_case
	        self.file_repository = file_repository

	    @staticmethod
	    def _tool_version(executable: str) -> str:
	        """Return the stdout of ``<executable> --version``.

	        The command is passed as an argument list (no shell). A binary that is
	        missing or cannot be started yields an empty string, which is what the
	        previous shell-based invocation produced on stdout in that situation.
	        """
	        try:
	            completed = subprocess.run([executable, "--version"], text=True, capture_output=True, check=False)
	        except OSError:
	            return ""
	        return completed.stdout

	    @staticmethod
	    def _with_xml_suffix(xml_file_name: str) -> str:
	        """Return ``xml_file_name`` with ".xml" appended unless it already ends with it."""
	        # NOTE(review): the name is client-supplied; presumably the repository confines it
	        # to its own XML directory - confirm that path separators are rejected there.
	        if xml_file_name.endswith(".xml"):
	            return xml_file_name
	        return f"{xml_file_name}.xml"

	    async def root(self):
	        """Return the interpreter version and whether torch reports CUDA as available."""
	        # Imported lazily so that importing this module does not require loading torch.
	        import torch

	        return sys.version + " Using GPU: " + str(torch.cuda.is_available())

	    async def info(self):
	        """Return the Python version, OCR tool versions and the supported OCR languages."""
	        # The version probes spawn subprocesses, so they run in the threadpool
	        # instead of blocking the event loop.
	        return {
	            "sys": sys.version,
	            "tesseract_version": await run_in_threadpool(self._tool_version, "tesseract"),
	            "ocrmypdf_version": await run_in_threadpool(self._tool_version, "ocrmypdf"),
	            "supported_languages": self.process_ocr_use_case.get_supported_languages(),
	        }

	    async def error(self):
	        """Deliberately raise ``FileNotFoundError``; the message marks it as a test error."""
	        raise FileNotFoundError("This is a test error from the error endpoint")

	    async def analyze_pdf(
	        self, file: UploadFile = File(...), fast: bool = Form(False), parse_tables_and_math: bool = Form(False)
	    ):
	        """Run the PDF analysis use case on the uploaded file's bytes and return its result."""
	        pdf_bytes = await file.read()
	        return await run_in_threadpool(
	            self.analyze_pdf_use_case.execute, pdf_bytes, "", parse_tables_and_math, fast, False
	        )

	    async def analyze_and_save_xml(
	        self, file: UploadFile = File(...), xml_file_name: Optional[str] = None, fast: bool = Form(False)
	    ):
	        """Analyze the uploaded PDF and pass the ".xml"-suffixed name to ``execute_and_save_xml``.

	        Raises:
	            HTTPException: status 422 when ``xml_file_name`` is missing or empty.
	        """
	        if not xml_file_name:
	            # Function-scope import: HTTPException is not among this module's top-level imports.
	            from fastapi import HTTPException

	            # Previously a missing name crashed with AttributeError on ``None.endswith``.
	            raise HTTPException(status_code=422, detail="xml_file_name is required")
	        target_name = self._with_xml_suffix(xml_file_name)
	        pdf_bytes = await file.read()
	        return await run_in_threadpool(self.analyze_pdf_use_case.execute_and_save_xml, pdf_bytes, target_name, fast)

	    async def get_xml_by_name(self, xml_file_name: str):
	        """Return what the file repository's ``get_xml`` yields for the ".xml"-suffixed name."""
	        target_name = self._with_xml_suffix(xml_file_name)
	        return await run_in_threadpool(self.file_repository.get_xml, target_name)

	    async def get_toc_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False)):
	        """Forward the upload and the ``fast`` flag to the TOC extraction use case."""
	        return await run_in_threadpool(self.extract_toc_use_case.execute, file, fast)

	    async def toc_legacy_uwazi_compatible(self, file: UploadFile = File(...)):
	        """Forward the upload to the TOC use case's ``execute_uwazi_compatible`` entry point."""
	        return await run_in_threadpool(self.extract_toc_use_case.execute_uwazi_compatible, file)

	    async def get_text_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False), types: str = Form("all")):
	        """Forward the upload, the ``fast`` flag and the ``types`` filter to the text extraction use case."""
	        return await run_in_threadpool(self.extract_text_use_case.execute, file, fast, types)

	    async def get_visualization_endpoint(self, file: UploadFile = File(...), fast: bool = Form(False)):
	        """Forward the upload and the ``fast`` flag to the visualization use case."""
	        return await run_in_threadpool(self.create_visualization_use_case.execute, file, fast)

	    async def ocr_pdf_sync(self, file: UploadFile = File(...), language: str = Form("en")):
	        """Forward the upload and the OCR ``language`` (default "en") to the OCR use case."""
	        return await run_in_threadpool(self.process_ocr_use_case.execute, file, language)

	    async def convert_to_markdown_endpoint(
	        self,
	        file: UploadFile = File(...),
	        fast: bool = Form(False),
	        extract_toc: bool = Form(False),
	        dpi: int = Form(120),
	        output_file: Optional[str] = Form(None),
	    ) -> Union[str, Response]:
	        """Pass the uploaded bytes and the form options to the markdown conversion use case."""
	        pdf_bytes = await file.read()
	        return await run_in_threadpool(
	            self.convert_to_markdown_use_case.execute,
	            pdf_bytes,
	            fast,
	            extract_toc,
	            dpi,
	            output_file,
	        )

	    async def convert_to_html_endpoint(
	        self,
	        file: UploadFile = File(...),
	        fast: bool = Form(False),
	        extract_toc: bool = Form(False),
	        dpi: int = Form(120),
	        output_file: Optional[str] = Form(None),
	    ) -> Union[str, Response]:
	        """Pass the uploaded bytes and the form options to the HTML conversion use case."""
	        pdf_bytes = await file.read()
	        return await run_in_threadpool(
	            self.convert_to_html_use_case.execute,
	            pdf_bytes,
	            fast,
	            extract_toc,
	            dpi,
	            output_file,
	        )