Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, UploadFile, File, HTTPException | |
| from fastapi.responses import JSONResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.staticfiles import StaticFiles | |
| import tempfile | |
| import os | |
| import sys | |
| import traceback | |
| from datetime import datetime | |
| from typing import Dict, Any | |
| import shutil | |
| import torch | |
| import asyncio | |
| import logging | |
| from contextlib import asynccontextmanager | |
# Logging setup: INFO level with a timestamped "time - logger - level - message"
# layout, plus the named logger used throughout this module.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("pdf_converter_api")
# Put the directory one level above this file's directory on sys.path so the
# pdf_converter package can be imported.
_this_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(_this_dir))

# convert_pdf performs a conversion; initialize_converter is called once at
# application startup.
from pdf_converter.convert_pdf_to_md import convert_pdf, initialize_converter
# --- Configuration for output directory ---
# Prefer the /app prefix used inside the Docker container; when that directory
# does not exist (local testing, run from the project root) fall back to the
# current working directory.
base_dir = "/app" if os.path.exists("/app") else "."
out_sub_dir = "docker_mineru/output"
output_dir = os.path.join(base_dir, out_sub_dir)
images_dir = os.path.join(output_dir, "images")

# Make sure the output directory and its images subdirectory both exist.
for _required_dir in (output_dir, images_dir):
    os.makedirs(_required_dir, exist_ok=True)
logger.info(f"Using output directory: {output_dir}")
# --- End Configuration ---
# Track initialization status: set by ``lifespan`` at startup and read by the
# request handlers to report health and to refuse work when loading failed.
initialization_successful = False


# --- Lifespan management for model loading ---
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize the marker converter at startup and log at shutdown.

    The code before ``yield`` runs once before the application starts serving:
    it calls ``initialize_converter`` in the default executor so the event loop
    is not blocked, waiting at most 300 seconds. The outcome is recorded in the
    module-level ``initialization_successful`` flag; a failure or timeout is
    logged but does not prevent the application from starting.

    Parameters:
        app: The FastAPI application this lifespan is attached to (unused).

    Yields:
        None. Control returns to the application between startup and shutdown.
    """
    global initialization_successful
    logger.info("Application startup: Initializing marker converter...")
    # get_running_loop() is the supported way to obtain the loop from inside a
    # coroutine; get_event_loop() is deprecated for this use.
    loop = asyncio.get_running_loop()
    try:
        # NOTE(review): wait_for only stops *waiting* after the timeout; the
        # executor thread cannot be cancelled and may still finish later.
        await asyncio.wait_for(
            loop.run_in_executor(None, initialize_converter),
            timeout=300,  # 5 minute timeout for initialization
        )
    except asyncio.TimeoutError:
        logger.error("Marker converter initialization timed out after 5 minutes.")
        initialization_successful = False
    except Exception as e:
        logger.error(f"Marker converter initialization failed: {e}")
        logger.error(traceback.format_exc())
        initialization_successful = False
    else:
        initialization_successful = True
        logger.info("Marker converter initialization process finished successfully.")
    yield
    # Nothing to release explicitly; just record the shutdown.
    logger.info("Application shutdown.")
# Application metadata: Markdown text passed to FastAPI as the API description.
app_description = """
# PDF to Markdown Converter API (Optimized)
This API provides PDF processing capabilities to convert PDF documents to Markdown format using marker.
It pre-loads models for faster processing.
## Features:
- PDF to Markdown conversion using marker
- Optimized for faster startup and processing
- Simple API interface
"""

# The application object; ``lifespan`` runs the converter initialization
# before requests are served.
app = FastAPI(
    lifespan=lifespan,
    title="PDF to Markdown API",
    version="1.1.0",
    description=app_description,
)
# CORS: accept requests from any origin, with any method and any header.
# NOTE(review): a wildcard origin combined with allow_credentials=True is a very
# permissive policy; confirm it is intended before exposing the service.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_methods=["*"],
)

# Static files: the URL prefix /output is served from output_dir on disk, so
# URLs returned to clients use /output rather than the filesystem path.
app.mount("/output", StaticFiles(directory=output_dir), name="output")
# Health check endpoint
# NOTE(review): no route registration for this handler is visible in this file;
# the "/health" path is an assumption, so confirm it against existing clients.
@app.get("/health")
async def health_check() -> Dict[str, Any]:
    """Report service status, GPU details and the current server time.

    Returns:
        A dict with:
        - "status": "healthy" when the converter initialized at startup,
          otherwise "degraded".
        - "timestamp": ``datetime.now().isoformat()``.
        - "service": the fixed service name.
        - "gpu": CUDA availability, device count/name/index and allocated /
          reserved memory in MB, or placeholder values when CUDA is unavailable.
        - "model_initialized": the raw initialization flag.
        - "output_directory_used": the configured output directory (debug aid).
    """
    # Query CUDA availability once instead of once per reported field.
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        gpu_info = {
            "cuda_available": True,
            "device_count": torch.cuda.device_count(),
            "device_name": torch.cuda.get_device_name(0),
            "current_device": torch.cuda.current_device(),
            "memory_allocated": f"{torch.cuda.memory_allocated()/1024**2:.2f} MB",
            "memory_reserved": f"{torch.cuda.memory_reserved()/1024**2:.2f} MB",
        }
    else:
        # Same keys as above so clients always see a stable schema.
        gpu_info = {
            "cuda_available": False,
            "device_count": 0,
            "device_name": "N/A",
            "current_device": -1,
            "memory_allocated": "N/A",
            "memory_reserved": "N/A",
        }
    return {
        "status": "healthy" if initialization_successful else "degraded",
        "timestamp": datetime.now().isoformat(),
        "service": "pdf-to-markdown-converter",
        "gpu": gpu_info,
        "model_initialized": initialization_successful,
        "output_directory_used": output_dir,  # Add info for debugging
    }
# Lazily created lock that keeps conversions strictly one at a time, exactly as
# they were when the handler ran the converter directly on the event loop.
_conversion_lock = None


def _get_conversion_lock():
    """Return the shared conversion lock, creating it on first use.

    The lock is created from inside a running request handler so that it is
    always associated with the event loop that serves the requests.
    """
    global _conversion_lock
    if _conversion_lock is None:
        _conversion_lock = asyncio.Lock()
    return _conversion_lock


def _markdown_preview(md_content, limit=1000):
    """Return up to *limit* characters of *md_content*, adding "..." only when truncated."""
    if not md_content:
        return ""
    if len(md_content) <= limit:
        return md_content
    return md_content[:limit] + "..."


# NOTE(review): no route registration for this handler is visible in this file;
# the "/convert" path is an assumption, so confirm it against existing clients.
@app.post("/convert")
async def convert(file: UploadFile = File(...)) -> Dict[str, Any]:
    """Convert an uploaded PDF to Markdown with the pre-loaded marker converter.

    The upload is written to a temporary file under /tmp, converted with
    ``convert_pdf`` into ``<output_dir>/<name>.md`` and the temporary file is
    removed afterwards, whether or not the conversion succeeded.

    Parameters:
        file: The PDF file to process; its name must end in ".pdf"
            (case-insensitive).

    Returns:
        On success, a dict with "filename", "status", "markdown_preview" (at
        most the first 1000 characters) and "output_file_url" (path under the
        /output static mount). A 503 JSONResponse when the converter did not
        initialize at startup, or a 500 JSONResponse with the error text when
        processing fails.

    Raises:
        HTTPException: 400 when the upload has no name or is not a ".pdf".
    """
    # Check if models initialized successfully
    if not initialization_successful:
        return JSONResponse(
            status_code=503,  # Service Unavailable
            content={
                "error": "Service not ready",
                "detail": "The model initialization failed during startup. The service cannot process requests at this time."
            }
        )

    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

    content = await file.read()
    temp_pdf_path = None
    try:
        # delete=False: the converter reopens the file by path after this
        # handle is closed; the ``finally`` clause below removes it.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False, dir="/tmp") as temp_pdf:
            temp_pdf.write(content)
            temp_pdf_path = temp_pdf.name
        logger.info(f"Temporary PDF saved to: {temp_pdf_path}")

        # basename() drops any directory part of the client-supplied name so
        # the output always lands directly inside output_dir.
        filename_without_ext = os.path.splitext(os.path.basename(file.filename))[0]
        output_md_file = os.path.join(output_dir, f"{filename_without_ext}.md")
        logger.info(f"Output markdown path: {output_md_file}")

        # Run the blocking conversion in the default executor so the event
        # loop (and therefore other endpoints) stays responsive. The lock keeps
        # conversions sequential, as they were before.
        loop = asyncio.get_running_loop()
        async with _get_conversion_lock():
            md_content = await loop.run_in_executor(
                None, convert_pdf, temp_pdf_path, output_md_file
            )

        # URL path under the StaticFiles mount; built as a URL, not with
        # os.path.join, because it is not a filesystem path.
        relative_output_path = f"/output/{filename_without_ext}.md"
        return {
            "filename": file.filename,
            "status": "success",
            # Only a preview is returned; the full document is at the URL.
            "markdown_preview": _markdown_preview(md_content),
            "output_file_url": relative_output_path
        }
    except Exception as e:
        error_detail = str(e)
        error_trace = traceback.format_exc()
        logger.error(f"Error processing PDF '{file.filename if file else 'N/A'}': {error_detail}")
        logger.error(error_trace)
        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing PDF",
                "detail": error_detail,
                "filename": file.filename if file and hasattr(file, 'filename') else None
            }
        )
    finally:
        # Clean up the temporary file; a failure here is logged, never raised.
        if temp_pdf_path and os.path.exists(temp_pdf_path):
            try:
                os.unlink(temp_pdf_path)
                logger.info(f"Temporary file {temp_pdf_path} deleted.")
            except Exception as unlink_err:
                logger.error(f"Error deleting temporary file {temp_pdf_path}: {unlink_err}")
# No __main__ block is needed here: the CMD in the Dockerfile handles startup.
# For reference, the equivalent manual launch would be:
# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False)