Spaces:

marcosremar2
/

docker_mineru

Sleeping

App Files Community

marcosremar2 committed on May 3

Commit

f30c298

1 Parent(s): c1e65a1

Enhance FastAPI implementation with better documentation, error handling and examples

Browse files

Files changed (3) hide show

README.md +72 -3
app.py +93 -8
requirements.txt +4 -3

README.md CHANGED Viewed

@@ -9,8 +9,77 @@ app_file: app.py
 pinned: false
 ---
-# MinerU PDF Extractor (Docker Space)
-This Hugging Face Space uses `magic-pdf` to extract structured content from PDFs using FastAPI.
-Send a `POST` request to `/extract` with a PDF file to receive extracted results.

 pinned: false
 ---
+# MinerU PDF Extractor API
+This Hugging Face Space provides a FastAPI-based service that uses `magic-pdf` to extract structured content from PDFs. The service exposes REST endpoints to process PDF files and return extracted text and tables in a structured JSON format.
+## API Endpoints
+### Health Check
+```
+GET /health
+```
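+Returns a JSON object with the service status (`status`), the current server time as an ISO 8601 string (`timestamp`), and the service name (`service`).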
+### Extract PDF Content
+```
+POST /extract
+```
+Upload a PDF file to extract its text content and tables.
+#### Request
+- Content-Type: multipart/form-data
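+- Body: PDF file in the `file` field. The uploaded filename must end in `.pdf` (case-insensitive); otherwise the API responds with HTTP 400.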
+#### Response
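+JSON object with a single top-level `result` key, whose value contains:
+- `filename`: the name of the uploaded file
+- `pages`: a list of pages with extracted text
+- Tables in Markdown format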
+## Usage Examples
+### Using cURL
+```bash
+curl -X POST "https://marcosremar2-docker-mineru.hf.space/extract" \
+  -H "Content-Type: multipart/form-data" \
+  -F "file=@your_document.pdf" \
+  --output result.json
+```
+### Using Python
+```python
+import requests
+url = "https://marcosremar2-docker-mineru.hf.space/extract"
+files = {"file": open("your_document.pdf", "rb")}
+response = requests.post(url, files=files)
+data = response.json()
+# Process the extracted data
+print(f"Filename: {data['result']['filename']}")
+print(f"Number of pages: {len(data['result']['pages'])}")
+```
+## API Documentation
+Once deployed, you can access the auto-generated Swagger documentation at:
+```
+https://marcosremar2-docker-mineru.hf.space/docs
+```
+For ReDoc documentation:
+```
+https://marcosremar2-docker-mineru.hf.space/redoc
+```

app.py CHANGED Viewed

@@ -1,15 +1,76 @@
-from fastapi import FastAPI, UploadFile, File
 from fastapi.responses import JSONResponse
 import magic_pdf
 import tempfile
 import os
 import json
-app = FastAPI()
-@app.post("/extract")
-async def extract(file: UploadFile = File(...)):
     content = await file.read()
     try:
         # Save the uploaded PDF to a temporary file
         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
@@ -21,6 +82,7 @@ async def extract(file: UploadFile = File(...)):
         # Convert result to dictionary
         output = {
             "pages": []
         }
@@ -36,9 +98,32 @@ async def extract(file: UploadFile = File(...)):
             output["pages"].append(page_data)
-        # Clean up the temporary file
-        os.unlink(temp_pdf_path)
         return {"result": output}
     except Exception as e:
-        return JSONResponse(status_code=500, content={"error": str(e)})

+from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
 import magic_pdf
 import tempfile
 import os
 import json
+import traceback
+import uvicorn
+from datetime import datetime
+from typing import Dict, List, Any, Optional
+# Application metadata
+app_description = """
+# MinerU PDF Processor API
+This API provides PDF processing capabilities using MinerU's magic-pdf library.
+It extracts text content and tables from PDF documents.
+## Features:
+- PDF text extraction
+- Table detection and extraction
+- JSON response for easy integration
+"""
+app = FastAPI(
+    title="MinerU PDF API",
+    description=app_description,
+    version="1.0.0",
+    contact={
+        "name": "PDF Converter Service",
+    },
+)
+# Add CORS middleware to allow cross-origin requests
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Allow all origins
+    allow_credentials=True,
+    allow_methods=["*"],  # Allow all methods
+    allow_headers=["*"],  # Allow all headers
+)
+# Health check endpoint
+@app.get("/health", tags=["Health"])
+async def health_check() -> Dict[str, Any]:
+    """
+    Health check endpoint to verify the service is running.
+    Returns the service status and current time.
+    """
+    return {
+        "status": "healthy",
+        "timestamp": datetime.now().isoformat(),
+        "service": "mineru-pdf-processor"
+    }
+@app.post("/extract", tags=["PDF Processing"])
+async def extract(file: UploadFile = File(...)) -> Dict[str, Any]:
+    """
+    Extract text and tables from a PDF file.
+    Parameters:
+        file: The PDF file to process
+    Returns:
+        A JSON object containing the extracted content with pages, text blocks, and tables
+    """
+    if not file.filename or not file.filename.lower().endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="Invalid file. Please upload a PDF file.")
     content = await file.read()
+    temp_pdf_path = None
     try:
         # Save the uploaded PDF to a temporary file
         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
         # Convert result to dictionary
         output = {
+            "filename": file.filename,
             "pages": []
         }
             output["pages"].append(page_data)
         return {"result": output}
     except Exception as e:
+        error_detail = str(e)
+        error_trace = traceback.format_exc()
+        # Log the error (would be better with a proper logger)
+        print(f"Error processing PDF: {error_detail}")
+        print(error_trace)
+        return JSONResponse(
+            status_code=500,
+            content={
+                "error": "Error processing PDF",
+                "detail": error_detail,
+                "filename": file.filename if file and hasattr(file, 'filename') else None
+            }
+        )
+    finally:
+        # Clean up the temporary file
+        if temp_pdf_path and os.path.exists(temp_pdf_path):
+            try:
+                os.unlink(temp_pdf_path)
+            except Exception:
+                pass
+if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
-fastapi
-uvicorn
 magic-pdf[full]==1.3.10
-python-multipart

+fastapi==0.100.0
+uvicorn==0.23.2
 magic-pdf[full]==1.3.10
+python-multipart==0.0.6
+requests==2.31.0