Spaces:

aabdullah27
/

SmolDocling-OCR-App

Running

App Files Files Community

Muhammad Abdullah committed on Mar 19

Commit

80e8620

verified ·

1 Parent(s): 2bd090b

Upload 2 files

Browse files

Files changed (2) hide show

app.py +574 -0
requirements.txt +8 -0

app.py ADDED Viewed

	@@ -0,0 +1,574 @@

+import streamlit as st
+import os
+import time
+import torch
+import tempfile
+from PIL import Image
+from dotenv import load_dotenv
+import logging
+from datetime import datetime
+# Set up logging: INFO level, records formatted as "time - level - message"
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+# Load environment variables
+load_dotenv()
+# Optional Hugging Face token; load_model() only calls login() when it is set
+HF_TOKEN = os.getenv("HF_TOKEN")
+# Directory passed as cache_dir when the processor/model are loaded; falls back to
+# "smoldocling_cache" inside the system temp directory when CACHE_DIR is unset
+CACHE_DIR = os.getenv("CACHE_DIR", os.path.join(tempfile.gettempdir(), "smoldocling_cache"))
+# Ensure cache directory exists
+os.makedirs(CACHE_DIR, exist_ok=True)
+# Import for Transformers approach. A failed import is recorded in a flag
+# (reported by check_dependencies()) rather than raised here.
+try:
+    from transformers import AutoProcessor, AutoModelForVision2Seq
+    from huggingface_hub import login
+    transformers_available = True
+except ImportError:
+    transformers_available = False
+# Same pattern for docling-core, which converts DocTags into document exports
+try:
+    from docling_core.types.doc import DoclingDocument
+    from docling_core.types.doc.document import DocTagsDocument
+    docling_available = True
+except ImportError:
+    docling_available = False
+# Global variables for model caching; assigned by load_model() and
+# process_single_image()
+processor = None
+model = None
+def check_dependencies():
+    """Check if all required dependencies are installed"""
+    missing = []
+    if not transformers_available:
+        missing.append("transformers huggingface_hub")
+    if not docling_available:
+        missing.append("docling-core")
+    return missing
+def get_available_devices():
+    """Get available processing devices"""
+    devices = ["cpu"]
+    if torch.cuda.is_available():
+        cuda_count = torch.cuda.device_count()
+        for i in range(cuda_count):
+            devices.append(f"cuda:{i} ({torch.cuda.get_device_name(i)})")
+    return devices
+def get_device_from_selection(selection):
+    """Convert user-friendly device selection to torch device"""
+    if selection.startswith("cuda:"):
+        return selection.split(" ")[0]  # Extract just the "cuda:X" part
+    return "cpu"
+@st.cache_resource
+def load_model(_device):
+    """Load and cache the model to avoid reloading"""
+    global processor, model
+    # Authenticate with Hugging Face
+    if HF_TOKEN:
+        login(token=HF_TOKEN)
+    try:
+        logger.info(f"Loading SmolDocling model on {_device}...")
+        processor = AutoProcessor.from_pretrained(
+            "ds4sd/SmolDocling-256M-preview",
+            cache_dir=CACHE_DIR
+        )
+        model = AutoModelForVision2Seq.from_pretrained(
+            "ds4sd/SmolDocling-256M-preview",
+            torch_dtype=torch.float16 if _device.startswith("cuda") else torch.float32,
+            cache_dir=CACHE_DIR
+        ).to(_device)
+        logger.info("Model loaded successfully")
+        return processor, model
+    except Exception as e:
+        logger.error(f"Error loading model: {str(e)}")
+        raise
+def optimize_image(image, max_size=1600):
+    """Optimize image size while maintaining aspect ratio"""
+    width, height = image.size
+    if max(width, height) > max_size:
+        if width > height:
+            new_width = max_size
+            new_height = int(height * (max_size / width))
+        else:
+            new_height = max_size
+            new_width = int(width * (max_size / height))
+        image = image.resize((new_width, new_height), Image.LANCZOS)
+    return image
+def process_single_image(image, prompt_text="Convert this page to docling.", device="cpu", show_progress=None):
+    """Process a single image"""
+    global processor, model
+    # Optimize image
+    image = optimize_image(image)
+    start_time = time.time()
+    # Load the model if not already loaded
+    processor, model = load_model(device)
+    # Create input messages
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": prompt_text}
+            ]
+        },
+    ]
+    # Prepare inputs
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=[image], return_tensors="pt")
+    inputs = inputs.to(device)
+    # Generate outputs
+    with torch.no_grad():  # Add this to save memory
+        generated_ids = model.generate(
+            **inputs,
+            max_new_tokens=1500,  # Increased for better results
+            do_sample=False,      # Deterministic generation
+            num_beams=1,          # Simple beam search
+            temperature=1.0,      # No temperature scaling
+        )
+    prompt_length = inputs.input_ids.shape[1]
+    trimmed_generated_ids = generated_ids[:, prompt_length:]
+    doctags = processor.batch_decode(
+        trimmed_generated_ids,
+        skip_special_tokens=False,
+    )[0].lstrip()
+    # Clean the output
+    doctags = doctags.replace("<end_of_utterance>", "").strip()
+    # Populate document
+    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
+    # Create a docling document
+    doc = DoclingDocument(name="Document")
+    doc.load_from_doctags(doctags_doc)
+    # Export as markdown
+    md_content = doc.export_to_markdown()
+    # Export as HTML
+    html_content = doc.export_to_html()
+    # Get plain text
+    plain_text = doc.export_to_text()
+    processing_time = time.time() - start_time
+    return {
+        "doctags": doctags,
+        "markdown": md_content,
+        "html": html_content,
+        "text": plain_text,
+        "processing_time": processing_time
+    }
+def process_batch(images, prompt_text, device, progress_bar=None):
+    """Process a batch of images with progress tracking"""
+    results = []
+    total = len(images)
+    for idx, image in enumerate(images):
+        if progress_bar:
+            progress_bar.progress((idx) / total, text=f"Processing image {idx+1}/{total}")
+        result = process_single_image(image, prompt_text, device)
+        results.append(result)
+        if progress_bar:
+            progress_bar.progress((idx + 1) / total, text=f"Processed {idx+1}/{total} images")
+    return results
+def save_session_history(results):
+    """Save processing results to session history"""
+    if 'history' not in st.session_state:
+        st.session_state.history = []
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    for idx, result in enumerate(results):
+        st.session_state.history.append({
+            "id": len(st.session_state.history) + 1,
+            "timestamp": timestamp,
+            "type": "Image " + str(idx + 1),
+            "processing_time": result["processing_time"],
+            "result": result
+        })
+def display_history():
+    """Display session history"""
+    if 'history' not in st.session_state or not st.session_state.history:
+        st.info("No processing history available")
+        return
+    st.subheader("Processing History")
+    for item in reversed(st.session_state.history):
+        with st.expander(f"#{item['id']} - {item['type']} ({item['timestamp']})"):
+            st.write(f"Processing time: {item['processing_time']:.2f} seconds")
+            tabs = st.tabs(["Markdown", "Text", "DocTags", "HTML"])
+            with tabs[0]:
+                st.markdown(item['result']['markdown'])
+                st.download_button(
+                    "Download Markdown",
+                    item['result']['markdown'],
+                    file_name=f"output_{item['id']}.md"
+                )
+            with tabs[1]:
+                st.text_area("Plain Text", item['result']['text'], height=200)
+                st.download_button(
+                    "Download Text",
+                    item['result']['text'],
+                    file_name=f"output_{item['id']}.txt"
+                )
+            with tabs[2]:
+                st.text_area("DocTags", item['result']['doctags'], height=200)
+                st.download_button(
+                    "Download DocTags",
+                    item['result']['doctags'],
+                    file_name=f"output_{item['id']}.dt"
+                )
+            with tabs[3]:
+                st.code(item['result']['html'], language="html")
+                st.download_button(
+                    "Download HTML",
+                    item['result']['html'],
+                    file_name=f"output_{item['id']}.html"
+                )
+def main():
+    # App configuration
+    st.set_page_config(
+        page_title="SmolDocling OCR App",
+        page_icon="📄",
+        layout="wide",
+        initial_sidebar_state="expanded"
+    )
+    # Custom theme
+    st.markdown("""
+    <style>
+    .main-header {
+        font-size: 2.5rem;
+        margin-bottom: 0.5rem;
+    }
+    .sub-header {
+        font-size: 1.2rem;
+        color: #666;
+        margin-bottom: 2rem;
+    }
+    .stTabs [data-baseweb="tab-list"] {
+        gap: 2px;
+    }
+    .stTabs [data-baseweb="tab"] {
+        padding: 10px 16px;
+        background-color: #f0f2f6;
+    }
+    .stTabs [aria-selected="true"] {
+        background-color: #e6f0ff;
+    }
+    </style>
+    """, unsafe_allow_html=True)
+    # App header
+    st.markdown('<p class="main-header">SmolDocling OCR App</p>', unsafe_allow_html=True)
+    st.markdown('<p class="sub-header">Extract text from images using SmolDocling AI</p>', unsafe_allow_html=True)
+    # Check dependencies
+    missing_deps = check_dependencies()
+    if missing_deps:
+        st.error(f"Missing dependencies: {', '.join(missing_deps)}. Please install them to use this app.")
+        st.info("Install with: pip install " + " ".join(missing_deps))
+        st.stop()
+    # Initialize session state
+    if 'results' not in st.session_state:
+        st.session_state.results = []
+    # Create sidebar
+    with st.sidebar:
+        st.header("Configuration")
+        # Device selection
+        st.subheader("Processing Device")
+        available_devices = get_available_devices()
+        selected_device = st.selectbox(
+            "Select processing device",
+            available_devices,
+            index=0 if len(available_devices) == 1 else 1,  # Default to CUDA if available
+            help="Choose the device for model inference. GPU (CUDA) is recommended for faster processing."
+        )
+        device = get_device_from_selection(selected_device)
+        # Model info
+        st.info(f"Selected device: {selected_device}")
+        if device == "cpu":
+            st.warning("⚠️ CPU processing may be slow. Select a GPU device if available for faster performance.")
+        # Memory management
+        if device.startswith("cuda"):
+            with st.expander("GPU Memory Management"):
+                st.write("Current GPU Memory Usage:")
+                if torch.cuda.is_available():
+                    gpu_idx = int(device.split(":")[1]) if ":" in device else 0
+                    allocated = torch.cuda.memory_allocated(gpu_idx) / (1024 ** 3)
+                    reserved = torch.cuda.memory_reserved(gpu_idx) / (1024 ** 3)
+                    st.progress(allocated / (torch.cuda.get_device_properties(gpu_idx).total_memory / (1024 ** 3)))
+                    st.write(f"Allocated: {allocated:.2f} GB")
+                    st.write(f"Reserved: {reserved:.2f} GB")
+                    if st.button("Clear GPU Cache"):
+                        torch.cuda.empty_cache()
+                        st.success("GPU cache cleared")
+        # Upload options
+        st.subheader("Upload Options")
+        upload_option = st.radio("Choose upload option:", ["Single Image", "Multiple Images"])
+        # Advanced options
+        with st.expander("Advanced Options"):
+            task_type = st.selectbox(
+                "Select task type",
+                [
+                    "Convert this page to docling.",
+                    "Convert this table to OTSL.",
+                    "Convert code to text.",
+                    "Convert formula to latex.",
+                    "Convert chart to OTSL.",
+                    "Extract all section header elements on the page."
+                ]
+            )
+            custom_prompt = st.text_area(
+                "Custom prompt (optional)",
+                value="",
+                help="Provide a custom prompt if needed. Leave empty to use the selected task type."
+            )
+            max_image_size = st.slider(
+                "Max image dimension (pixels)",
+                min_value=800,
+                max_value=3200,
+                value=1600,
+                step=100,
+                help="Larger values may improve OCR quality but use more memory"
+            )
+            final_prompt = custom_prompt if custom_prompt else task_type
+        # Upload controls
+        st.subheader("Upload Image(s)")
+        if upload_option == "Single Image":
+            uploaded_file = st.file_uploader("Upload image", type=["jpg", "jpeg", "png", "pdf"])
+            if uploaded_file is not None:
+                try:
+                    image = Image.open(uploaded_file).convert("RGB")
+                    st.image(image, caption="Uploaded Image", width=250)
+                except Exception as e:
+                    st.error(f"Error loading image: {str(e)}")
+        else:
+            uploaded_files = st.file_uploader(
+                "Upload multiple images",
+                type=["jpg", "jpeg", "png"],
+                accept_multiple_files=True
+            )
+            if uploaded_files:
+                st.success(f"{len(uploaded_files)} images uploaded")
+        # Process button
+        if (upload_option == "Single Image" and 'uploaded_file' in locals() and uploaded_file is not None) or \
+           (upload_option == "Multiple Images" and 'uploaded_files' in locals() and uploaded_files):
+            process_button = st.button("Process Image(s)", type="primary")
+        # History button
+        st.subheader("History")
+        if st.button("Show Processing History"):
+            st.session_state.show_history = True
+        # About section
+        with st.expander("About SmolDocling OCR"):
+            st.write("""
+            This app uses SmolDocling, a powerful OCR model for document understanding from Hugging Face Hub.
+            The app extracts DocTags format and converts it to Markdown, HTML, and plain text for easy reading.
+            Available tasks:
+            - Convert pages to DocTags (general OCR)
+            - Convert tables to OTSL
+            - Convert code snippets to text
+            - Convert formulas to LaTeX
+            - Convert charts to OTSL
+            - Extract section headers
+            """)
+    # Main content area
+    if 'show_history' in st.session_state and st.session_state.show_history:
+        display_history()
+        st.session_state.show_history = False
+    elif upload_option == "Single Image" and 'uploaded_file' in locals() and uploaded_file is not None and process_button:
+        with st.spinner("Processing image..."):
+            try:
+                progress_bar = st.progress(0, text="Preparing to process...")
+                # Update global optimization settings
+                optimize_image.func_defaults = (max_image_size,)
+                result = process_single_image(image, final_prompt, device)
+                st.session_state.results = [result]
+                # Save to history
+                save_session_history(st.session_state.results)
+                progress_bar.progress(1.0, text="Processing complete!")
+                # Display results
+                tabs = st.tabs(["Markdown", "Text", "DocTags", "HTML"])
+                with tabs[0]:
+                    st.subheader("Markdown Output")
+                    st.markdown(result["markdown"])
+                    st.download_button(
+                        "Download Markdown",
+                        result["markdown"],
+                        file_name="output.md"
+                    )
+                with tabs[1]:
+                    st.subheader("Plain Text Output")
+                    st.text_area("Extracted Text", result["text"], height=300)
+                    st.download_button(
+                        "Download Text",
+                        result["text"],
+                        file_name="output.txt"
+                    )
+                with tabs[2]:
+                    st.subheader("DocTags Output")
+                    st.text_area("DocTags", result["doctags"], height=300)
+                    st.download_button(
+                        "Download DocTags",
+                        result["doctags"],
+                        file_name="output.dt"
+                    )
+                with tabs[3]:
+                    st.subheader("HTML Output")
+                    st.code(result["html"], language="html")
+                    st.download_button(
+                        "Download HTML",
+                        result["html"],
+                        file_name="output.html"
+                    )
+                st.success(f"Processing completed in {result['processing_time']:.2f} seconds on {selected_device}")
+            except Exception as e:
+                st.error(f"Error processing image: {str(e)}")
+                logger.error(f"Error processing image: {str(e)}", exc_info=True)
+    elif upload_option == "Multiple Images" and 'uploaded_files' in locals() and uploaded_files and process_button:
+        try:
+            images = [Image.open(file).convert("RGB") for file in uploaded_files]
+            if len(images) > 0:
+                with st.spinner(f"Processing {len(images)} images..."):
+                    progress_bar = st.progress(0, text="Preparing to process...")
+                    # Update global optimization settings
+                    optimize_image.func_defaults = (max_image_size,)
+                    results = process_batch(images, final_prompt, device, progress_bar)
+                    st.session_state.results = results
+                    # Save to history
+                    save_session_history(results)
+                    progress_bar.progress(1.0, text="Processing complete!")
+                    # Display results
+                    st.subheader("Processing Results")
+                    total_time = sum(result["processing_time"] for result in results)
+                    avg_time = total_time / len(results)
+                    st.write(f"Total processing time: {total_time:.2f} seconds on {selected_device}")
+                    st.write(f"Average processing time: {avg_time:.2f} seconds per image")
+                    # Create tabs for each image
+                    for idx, (result, image) in enumerate(zip(results, images)):
+                        with st.expander(f"Image {idx+1} Results"):
+                            col1, col2 = st.columns([1, 2])
+                            with col1:
+                                st.image(image, caption=f"Image {idx+1}", width=250)
+                                st.write(f"Processing time: {result['processing_time']:.2f} seconds")
+                            with col2:
+                                inner_tabs = st.tabs(["Markdown", "Text", "DocTags", "HTML"])
+                                with inner_tabs[0]:
+                                    st.markdown(result["markdown"])
+                                    st.download_button(
+                                        f"Download Markdown",
+                                        result["markdown"],
+                                        file_name=f"output_{idx+1}.md"
+                                    )
+                                with inner_tabs[1]:
+                                    st.text_area("Plain Text", result["text"], height=200)
+                                    st.download_button(
+                                        f"Download Text",
+                                        result["text"],
+                                        file_name=f"output_{idx+1}.txt"
+                                    )
+                                with inner_tabs[2]:
+                                    st.text_area("DocTags", result["doctags"], height=200)
+                                    st.download_button(
+                                        f"Download DocTags",
+                                        result["doctags"],
+                                        file_name=f"output_{idx+1}.dt"
+                                    )
+                                with inner_tabs[3]:
+                                    st.code(result["html"], language="html")
+                                    st.download_button(
+                                        f"Download HTML",
+                                        result["html"],
+                                        file_name=f"output_{idx+1}.html"
+                                    )
+                    st.success(f"All images processed successfully")
+        except Exception as e:
+            st.error(f"Error processing images: {str(e)}")
+            logger.error(f"Error processing images: {str(e)}", exc_info=True)
+    # Display a welcome message if no image has been uploaded
+    if ('uploaded_file' not in locals() or uploaded_file is None) and \
+       ('uploaded_files' not in locals() or not uploaded_files):
+        st.info("👈 Upload an image using the sidebar to get started")
+# Entry point: build the page when this file is executed as the main module
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+streamlit
+torch
+accelerate
+transformers
+docling-core
+huggingface_hub
+Pillow
+python-dotenv