Spaces:

ignaciaginting
/

extract_from_doc

Build error

App Files Community

ignaciaginting committed on May 6

Commit

adf0a0e

verified ·

1 Parent(s): 61daa02

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -51

app.py CHANGED Viewed

@@ -1,53 +1,32 @@
 import streamlit as st
 import os
-import tempfile
-from huggingface_hub import snapshot_download
-from pdf2image import convert_from_path
-from PIL import Image
-import fitz  # PyMuPDF
-# Step 1: Download model if not present
-MODEL_DIR = "./pdf-extract-kit"
-if not os.path.exists(MODEL_DIR):
-    with st.spinner("Downloading model..."):
-        snapshot_download(repo_id="opendatalab/pdf-extract-kit-1.0", local_dir=MODEL_DIR, max_workers=20)
-# Step 2: Import model logic dynamically
-import sys
-sys.path.append(MODEL_DIR + "/inference")
-try:
-    from table_recognizer import TableRecognizer
-except ImportError:
-    st.error("❌ Unable to load TableRecognizer. Check model directory structure.")
-    st.stop()
-# Step 3: Set up recognizer
-table_model = TableRecognizer(
-    model_dir=os.path.join(MODEL_DIR, "models", "table_recognition"),
-    device="cpu"  # Change to 'cuda' if using GPU
-)
-st.title("📄 PDF Table Extractor")
-uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
-if uploaded_file:
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
-        tmp_pdf.write(uploaded_file.read())
-        tmp_pdf_path = tmp_pdf.name
-    images = convert_from_path(tmp_pdf_path)
-    for i, img in enumerate(images):
-        st.subheader(f"Page {i + 1}")
-        st.image(img, caption="Original Page", use_column_width=True)
-        # Step 4: Run Table Recognizer
-        with st.spinner("Extracting tables..."):
-            table_results = table_model(img)  # This assumes model takes a PIL image and returns result
-        if table_results:
-            for idx, table in enumerate(table_results):
-                st.markdown(f"#### Table {idx + 1}")
-                st.dataframe(table["data"])  # Assuming table["data"] is a 2D list or pandas DataFrame
-        else:
-            st.info("No tables detected on this page.")

 import streamlit as st
+from pdf_extract_kit.tasks.ocr import OCRTask
+from pdf_extract_kit.utils.config_loader import load_config
 import os
+# Streamlit app title
+st.title("PDF Table Extraction")
+# File uploader to upload PDF
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+if uploaded_file is not None:
+    # Save the uploaded file to a temporary location
+    with open("temp.pdf", "wb") as f:
+        f.write(uploaded_file.read())
+    # Configuration path for OCR task
+    config_path = "PDF-Extract-Kit/configs/ocr.yaml"  # Updated config path
+    config = load_config(config_path)
+    # Initialize the OCR task
+    task = OCRTask(config)
+    # Perform OCR task on the uploaded PDF
+    extracted_data = task.process("temp.pdf", save_dir="outputs", visualize=True)
+    # Display the extracted values
+    st.write("Extracted Data:")
+    st.write(extracted_data)
+    # Optional: Visualize the result (depending on how the output is generated)
+    # st.image('path_to_visualization_image', caption='Extracted Table', use_column_width=True)