Spaces:
Runtime error
Runtime error
| import os, tempfile, traceback | |
| import gradio as gr | |
| import spaces | |
| import requests | |
# ---------- Cache & HF Hub settings ----------
# Configure the Hugging Face caches and hub flags before the Docling imports
# further down. Everything goes through setdefault(), so a value that is
# already present in the environment is never replaced by a default.
import importlib.util

# Used only when a configured cache directory cannot be created.
_FALLBACK_ROOT = os.path.join(tempfile.gettempdir(), "huggingface")

# (environment variable, preferred default, fallback directory)
_CACHE_DIRS = (
    ("HF_HOME", "/data/.cache/huggingface", _FALLBACK_ROOT),
    ("HF_HUB_CACHE", "/data/.cache/huggingface/hub", os.path.join(_FALLBACK_ROOT, "hub")),
    (
        "TRANSFORMERS_CACHE",
        "/data/.cache/huggingface/transformers",
        os.path.join(_FALLBACK_ROOT, "transformers"),
    ),
)

for _var, _preferred, _fallback in _CACHE_DIRS:
    _path = os.environ.setdefault(_var, _preferred)
    try:
        os.makedirs(_path, exist_ok=True)
    except OSError:
        # The directory is missing and cannot be created (for example /data
        # is absent or read-only). Use the temp dir instead of crashing the
        # whole app at import time.
        os.makedirs(_fallback, exist_ok=True)
        os.environ[_var] = _fallback

# NOTE(review): huggingface_hub documents HF_HUB_DISABLE_XET rather than an
# "enable" switch; confirm this variable is actually honoured.
os.environ.setdefault("HF_HUB_ENABLE_XET", "0")

# Opt in to hf_transfer only when the package is importable; turning the flag
# on without it makes hub downloads fail instead of falling back.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
| # ---------- Docling imports ---------- | |
| from docling.datamodel.base_models import InputFormat | |
| from docling.document_converter import DocumentConverter, PdfFormatOption | |
| from docling.pipeline.vlm_pipeline import VlmPipeline | |
# CUDA info (informational)
# HAS_CUDA is True only when torch imports and reports a usable CUDA device.
HAS_CUDA = False
try:
    import torch

    HAS_CUDA = torch.cuda.is_available()
except Exception:
    # torch is missing or the probe itself failed: keep HAS_CUDA False.
    pass
else:
    # Thread-count tuning is kept separate from the probe so that a malformed
    # OMP_NUM_THREADS value cannot flip HAS_CUDA back to False.
    try:
        torch.set_num_threads(max(1, int(os.environ.get("OMP_NUM_THREADS", "2"))))
    except (ValueError, RuntimeError):
        pass
# Converters
# Two module-level converters, one per mode: the default PDF pipeline and the
# VLM pipeline. They are built once here and reused by every request.
_standard_pdf_option = PdfFormatOption()
std_converter = DocumentConverter(
    format_options={InputFormat.PDF: _standard_pdf_option},
)
_vlm_pdf_option = PdfFormatOption(pipeline_cls=VlmPipeline)
vlm_converter = DocumentConverter(
    format_options={InputFormat.PDF: _vlm_pdf_option},
)
| # ---------- Helpers ---------- | |
| def _success(md: str, html: str): | |
| tmpdir = tempfile.gettempdir() | |
| md_path = os.path.join(tmpdir, "output.md") | |
| html_path = os.path.join(tmpdir, "output.html") | |
| with open(md_path, "w", encoding="utf-8") as f: f.write(md) | |
| with open(html_path, "w", encoding="utf-8") as f: f.write(html) | |
| return md, html, md_path, html_path | |
| def _fail(msg: str): | |
| err = f"**Conversion failed**:\n```\n{msg}\n```" | |
| return err, "<pre>" + err + "</pre>", None, None | |
def _convert_local_path(path: str, use_vlm: bool):
    """Convert the document at ``path`` and return the UI payload.

    Args:
        path: Location of the file handed to the converter.
        use_vlm: Use ``vlm_converter`` when true, ``std_converter`` otherwise.

    Returns:
        The tuple from ``_success`` when conversion and both exports work;
        otherwise the tuple from ``_fail`` carrying the exception text and
        its traceback. Exceptions are reported, not raised.
    """
    try:
        if use_vlm:
            converter = vlm_converter
        else:
            converter = std_converter
        document = converter.convert(source=path).document
        markdown_text = document.export_to_markdown()
        html_text = document.export_to_html()
        return _success(markdown_text, html_text)
    except Exception as exc:
        details = traceback.format_exc()
        return _fail(f"{exc}\n\n{details}")
| # ---------- GPU-decorated endpoints ---------- | |
@spaces.GPU
def run_convert_file(file, mode):
    """Convert an uploaded PDF (handler for the "Upload PDF" tab).

    Decorated with ``spaces.GPU`` so a ZeroGPU host attaches a GPU for the
    duration of the call.

    Args:
        file: The upload from the file component, or ``None`` when nothing
            was uploaded. Either a path string or an object exposing the
            path as ``.name``.
        mode: Label of the selected mode; a label starting with ``"VLM"``
            selects the VLM pipeline.

    Returns:
        The 4-tuple produced by ``_convert_local_path``, or by ``_fail`` when
        no file was given.
    """
    if file is None:
        return _fail("No file provided.")
    # Depending on the Gradio version/config the upload arrives as a plain
    # path string or as a wrapper with a `.name` attribute; accept both.
    path = getattr(file, "name", file)
    return _convert_local_path(path, mode.startswith("VLM"))
@spaces.GPU
def run_convert_url(url, mode):
    """Download a PDF from ``url`` and convert it (handler for the URL tab).

    Decorated with ``spaces.GPU`` so a ZeroGPU host attaches a GPU for the
    duration of the call.

    Args:
        url: Address of the PDF to fetch; empty, ``None`` or whitespace-only
            values are rejected.
        mode: Label of the selected mode; a label starting with ``"VLM"``
            selects the VLM pipeline.

    Returns:
        The 4-tuple produced by ``_convert_local_path``, or by ``_fail`` when
        the URL is missing or the download fails.
    """
    url = (url or "").strip()
    if not url:
        return _fail("No URL provided.")

    # NOTE(review): the URL is user-supplied and fetched server-side with no
    # host allow-list or size cap; consider adding both if this is exposed
    # publicly.
    tmp_path = None
    try:
        try:
            # The context manager returns the streamed connection to the
            # pool even when the download is aborted halfway.
            with requests.get(url, stream=True, timeout=60) as resp:
                resp.raise_for_status()
                fd, tmp_path = tempfile.mkstemp(suffix=".pdf")
                with os.fdopen(fd, "wb") as tmp:
                    for chunk in resp.iter_content(chunk_size=1 << 20):
                        if chunk:
                            tmp.write(chunk)
        except Exception as e:
            return _fail(f"Failed to download URL: {e}")
        return _convert_local_path(tmp_path, mode.startswith("VLM"))
    finally:
        # Runs on every exit path, so a partially written file from a failed
        # download is removed as well.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # best-effort cleanup; the file may already be gone
# ---------- UI ----------
# Device line for the page header, chosen from the CUDA probe done at import.
subtitle = "Device: **CUDA (ZeroGPU)**" if HAS_CUDA else "Device: **CPU** (GPU warms on first call)"

with gr.Blocks(title="Granite-Docling 258M — PDF → Markdown/HTML") as demo:
    # Header text. Blank lines separate the Markdown paragraphs so the
    # subtitle, the "Modes" list and the closing note render as distinct
    # blocks.
    gr.Markdown(
        f"""# Granite-Docling 258M — PDF → Markdown / HTML

{subtitle}

**Modes**

- **Standard (faster)** — PDFs with a text layer
- **VLM (Granite — better for complex/scanned)** — scans / heavy tables / formulas

_First call may be slow while models download and ZeroGPU warms. Cache lives in `/data`._
"""
    )

    # Shared by both tabs. The handlers only check whether the selected
    # label starts with "VLM".
    mode = gr.Radio(
        ["Standard (faster)", "VLM (Granite — better for complex/scanned)"],
        value="Standard (faster)",
        label="Mode",
    )

    with gr.Tab("Upload PDF"):
        fi = gr.File(file_types=[".pdf"], label="PDF")
        md_preview = gr.Markdown(label="Markdown Preview")
        html_preview = gr.HTML(label="HTML Preview")  # rendered HTML
        dl_md = gr.File(label="Download Markdown (.md)")
        dl_html = gr.File(label="Download HTML (.html)")
        # Output order matches the 4-tuple returned by the handler:
        # markdown text, html text, markdown file path, html file path.
        gr.Button("Convert").click(
            fn=run_convert_file,
            inputs=[fi, mode],
            outputs=[md_preview, html_preview, dl_md, dl_html],
        )

    with gr.Tab("Convert from URL"):
        url = gr.Textbox(label="Public PDF URL", placeholder="https://.../file.pdf")
        md_preview2 = gr.Markdown(label="Markdown Preview")
        html_preview2 = gr.HTML(label="HTML Preview")
        dl_md2 = gr.File(label="Download Markdown (.md)")
        dl_html2 = gr.File(label="Download HTML (.html)")
        gr.Button("Convert").click(
            fn=run_convert_url,
            inputs=[url, mode],
            outputs=[md_preview2, html_preview2, dl_md2, dl_html2],
        )

# Bind & queue
# Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
demo.queue().launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))