# Spaces: Running on Zero
| import gradio as gr | |
| import json | |
| from docling.document_converter import DocumentConverter, PdfFormatOption | |
| from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions | |
| from docling.datamodel.base_models import InputFormat | |
| import spaces | |
# NOTE: `spaces` is imported at file level for an optional GPU decorator. Per the
# original author it is not really required for Docling OCR, so none is applied.
def convert_document(file, output_format):
    """Convert an uploaded PDF with Docling (OCR enabled) and export the result.

    Args:
        file: The upload handed over by the Gradio file input. Either an
            object exposing the temp-file path as ``.name`` or a plain path
            string. ``None`` means nothing was uploaded.
        output_format: ``"Markdown"`` or ``"JSON"``. Any other value yields
            an "unsupported format" message instead of converted text.

    Returns:
        A ``(converted_text, metadata)`` tuple. ``converted_text`` is always
        a string so it can be displayed in a text box; ``metadata`` is a
        JSON-friendly dict.
    """
    # Guard against clicking "Convert" before a file has been uploaded;
    # otherwise the path lookup below would raise on ``None``.
    if file is None:
        return "⚠️ No file uploaded", {}

    # Accept both file-like uploads (path stored in ``.name``) and plain
    # path strings.
    source_path = getattr(file, "name", file)

    # Configure the OCR pipeline.
    pdf_opts = PdfPipelineOptions(
        do_ocr=True,
        ocr_options=TesseractCliOcrOptions(lang=["eng"]),  # or ["eng", "ara"] if needed
    )

    # Pipeline options are passed per input format via ``format_options``.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
        }
    )

    result = converter.convert(source_path)
    document = result.document

    if output_format == "Markdown":
        converted_text = document.export_to_markdown()
    elif output_format == "JSON":
        # The text box needs a string, so serialize the exported dict rather
        # than returning the dict itself.
        converted_text = json.dumps(
            document.export_to_dict(), ensure_ascii=False, indent=2
        )
    else:
        converted_text = "⚠️ Unsupported format"

    # Metadata as a JSON-friendly dict: the attribute names of the document.
    metadata = {"Available Attributes": dir(document)}
    return converted_text, metadata
# Build the Gradio interface: a PDF upload and an output-format selector feed
# `convert_document`, whose two return values fill the text box and JSON viewer.
# NOTE(review): the original indentation was lost; the Row is assumed to hold
# only the two input components -- confirm against the intended layout.
with gr.Blocks() as app:
    gr.Markdown("# 📄 Document Converter with Docling OCR")
    gr.Markdown("Upload a PDF, choose the output format, and get the converted text + metadata.")

    with gr.Row():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        # Preselect a format so the first click never reaches the handler
        # with no format chosen.
        format_input = gr.Radio(
            ["Markdown", "JSON"],
            label="Choose Output Format",
            value="Markdown",
        )

    output_text = gr.Textbox(label="Converted Document", lines=20)
    output_metadata = gr.JSON(label="Metadata")
    convert_button = gr.Button("Convert")

    # The event listener is registered inside the Blocks context.
    convert_button.click(
        fn=convert_document,
        inputs=[file_input, format_input],
        outputs=[output_text, output_metadata],
    )

# Start the server only when run as a script, so importing this module does
# not launch the app as a side effect.
if __name__ == "__main__":
    app.launch(debug=True)