Spaces:

ignaciaginting
/

extract_from_doc

Build error

extract_from_doc / app.py

Update app.py

d29af94 verified 7 months ago

1.09 kB

	import streamlit as st
	import fitz # PyMuPDF
	from huggingface_hub import snapshot_download
	import os
	from pdf2image import convert_from_path
	from PIL import Image
	import tempfile

	# Streamlit entry script: fetch the PDF-Extract-Kit weights once, let the user
	# upload a PDF, rasterize it page by page and show each page image.

	# Download the model only when the local directory is missing.
	model_dir = "./pdf-extract-kit"
	if not os.path.exists(model_dir):
	    snapshot_download(
	        repo_id="opendatalab/pdf-extract-kit-1.0",
	        local_dir=model_dir,
	        max_workers=20,
	    )

	st.title("PDF Table Extractor with PDF-Extract-Kit-1.0")

	uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

	if uploaded_file:
	    st.write("Converting PDF to images...")

	    # convert_from_path needs a real file on disk, so spill the upload to a
	    # temporary PDF. getvalue() returns the whole buffer regardless of the
	    # current stream position, unlike read().
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
	        tmp_pdf.write(uploaded_file.getvalue())
	        tmp_pdf_path = tmp_pdf.name

	    try:
	        images = convert_from_path(tmp_pdf_path)
	    finally:
	        # delete=False means nothing else cleans this file up; remove it even
	        # when the conversion raises so uploads do not pile up on disk.
	        if os.path.exists(tmp_pdf_path):
	            os.remove(tmp_pdf_path)

	    for i, img in enumerate(images):
	        # NOTE(review): newer Streamlit releases deprecate use_column_width in
	        # favour of use_container_width - confirm against the pinned version.
	        st.image(img, caption=f"Page {i+1}", use_column_width=True)

	        # Here you would call the table detection model on each image
	        st.info("🛠 Table detection model would run here... (to be implemented)")

	    st.success("Done processing PDF!")