Spaces:

Deadmon
/

ocr-pdf

Sleeping

App Files Files Community

Deadmon committed on Mar 9

Commit

ab9d843

verified ·

1 Parent(s): c38a729

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -2

app.py CHANGED Viewed

@@ -1,10 +1,134 @@
-import gradio as gr
-from script import process_pdf  # Assuming the above script is saved as script.py
 from pathlib import Path
 OUTPUT_DIR = Path("outputs")
 OUTPUT_DIR.mkdir(exist_ok=True)
 def process_uploaded_pdf(pdf_file):
     if pdf_file is None:
         return "Please upload a PDF file."

+import os
 from pathlib import Path
+import fitz  # PyMuPDF for PDF handling
+from PIL import Image
+import pytesseract  # For OCR
+from transformers import BlipProcessor, BlipForConditionalGeneration  # For image captioning
+import io
+import torch
+import gradio as gr
+# Directory that receives the rendered page images; created at import time if it does not exist
 OUTPUT_DIR = Path("outputs")
 OUTPUT_DIR.mkdir(exist_ok=True)
+def pdf_to_images(pdf_path):
+    """
+    Render every page of a PDF to a PNG file and an in-memory PIL image.
+    Each page is scaled so that its longest side is roughly 2000 pixels,
+    saved as OUTPUT_DIR / "page_<n>.png" (n is 1-based), and kept in memory.
+    Args:
+        pdf_path: Path of the PDF to open with PyMuPDF.
+    Returns:
+        A list of (image_path, image) tuples in page order, or an empty list
+        if any step fails (the error is printed, not raised).
+    """
+    try:
+        images = []
+        # The context manager closes the document even when rendering or
+        # saving a page raises part-way through the loop.
+        with fitz.open(pdf_path) as pdf_document:
+            for page_num, page in enumerate(pdf_document, 1):
+                # Derive the zoom from the page size so small and large
+                # pages both end up near 2000 pixels on the longest side
+                rect = page.rect
+                zoom = 2000 / max(rect.width, rect.height)
+                mat = fitz.Matrix(zoom, zoom)
+                # alpha=False guarantees 3 bytes per pixel, which is what
+                # the "RGB" mode passed to Image.frombytes below expects
+                pix = page.get_pixmap(matrix=mat, alpha=False)
+                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                image_path = OUTPUT_DIR / f"page_{page_num}.png"
+                img.save(image_path, "PNG")
+                images.append((image_path, img))
+        return images
+    except Exception as e:
+        # Best-effort by design: callers treat an empty list as "nothing rendered"
+        print(f"Error converting PDF to images: {str(e)}")
+        return []
+def extract_text_from_image(image):
+    """
+    Run Tesseract OCR on an image and return the recognised text.
+    The result has leading and trailing whitespace removed. Any failure is
+    printed and reported to the caller as an empty string.
+    """
+    recognised = ""
+    try:
+        recognised = pytesseract.image_to_string(image).strip()
+    except Exception as ocr_error:
+        print(f"Error during OCR: {str(ocr_error)}")
+        recognised = ""
+    return recognised
+_BLIP_MODEL_NAME = "Salesforce/blip-image-captioning-base"
+# Holds the loaded (processor, model) pair so it is loaded once per process
+_BLIP_CACHE = {}
+def _load_blip():
+    """Return the BLIP (processor, model) pair, loading it on first use only."""
+    if "pair" not in _BLIP_CACHE:
+        processor = BlipProcessor.from_pretrained(_BLIP_MODEL_NAME)
+        model = BlipForConditionalGeneration.from_pretrained(_BLIP_MODEL_NAME)
+        # Store only after both loads succeed so a failed load is retried later
+        _BLIP_CACHE["pair"] = (processor, model)
+    return _BLIP_CACHE["pair"]
+def analyze_image(image_path):
+    """
+    Describe an image file with a caption generated by the BLIP model.
+    Args:
+        image_path: Path of the image file to caption.
+    Returns:
+        The generated caption, or the fixed message
+        "Image content could not be analyzed." if loading the model, reading
+        the image or generating the caption fails (the error is printed).
+    """
+    try:
+        # Reuse the cached model instead of reloading it for every page
+        processor, model = _load_blip()
+        # Close the file handle as soon as the pixel data has been converted
+        with Image.open(image_path) as source:
+            image = source.convert('RGB')
+        inputs = processor(image, return_tensors="pt")
+        # Inference only: no gradients are needed for caption generation
+        with torch.no_grad():
+            outputs = model.generate(**inputs)
+        caption = processor.decode(outputs[0], skip_special_tokens=True)
+        return caption
+    except Exception as e:
+        print(f"Error during image analysis: {str(e)}")
+        return "Image content could not be analyzed."
+def process_pdf(pdf_path, output_txt_path):
+    """
+    Run the whole pipeline on one PDF and write a plain-text report.
+    The PDF is rendered to page images; for each page the OCR text and an
+    image caption are appended to the report at output_txt_path. If no page
+    images are produced, a message is printed and no report is written.
+    Progress is reported with print().
+    """
+    heavy_rule = "=" * 50
+    light_rule = "-" * 30
+    print("Converting PDF to images...")
+    rendered_pages = pdf_to_images(pdf_path)
+    if not rendered_pages:
+        print("No images were generated from the PDF.")
+        return
+    with open(output_txt_path, 'w', encoding='utf-8') as report:
+        # Report header: source file name followed by a heavy rule
+        report.write(f"Analysis of {os.path.basename(pdf_path)}\n{heavy_rule}\n\n")
+        for number, (page_image_path, page_image) in enumerate(rendered_pages, start=1):
+            print(f"Processing page {number}...")
+            # Per-page header
+            report.write(f"Page {number}\n{light_rule}\n\n")
+            # OCR section, with a placeholder line when nothing was recognised
+            page_text = extract_text_from_image(page_image)
+            if not page_text:
+                report.write("No text could be extracted from this page.\n\n")
+            else:
+                report.write("Extracted Text:\n" + page_text + "\n\n")
+            # Caption section, closed by a heavy rule separating pages
+            caption = analyze_image(page_image_path)
+            report.write(f"Image Description:\n{caption}\n\n{heavy_rule}\n\n")
+    print(f"Processing complete. Results saved to {output_txt_path}")
 def process_uploaded_pdf(pdf_file):
     if pdf_file is None:
         return "Please upload a PDF file."