Spaces:

mgbam
/

builder

Sleeping

App Files Community

mgbam committed on Jul 18

Commit

9686c37

verified ·

1 Parent(s): 4739b8c

Update extractor.py

Browse files

Files changed (1) hide show

extractor.py +22 -76

extractor.py CHANGED Viewed

@@ -1,109 +1,55 @@
 # /extractor.py
-"""
-Handles content extraction from various sources like files, images, and websites.
-This module encapsulates the logic for parsing different file formats (PDF, DOCX),
-performing Optical Character Recognition (OCR) on images, and scraping web content.
-"""
-import mimetypes
-import os
-import re
-from urllib.parse import urlparse, urljoin
-import logging
-import PyPDF2
-import docx
-import requests
 from bs4 import BeautifulSoup
-# --- Setup Logging ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-# --- Optional OCR Imports ---
 try:
-    import cv2
-    import numpy as np
-    import pytesseract
     OCR_AVAILABLE = True
 except ImportError:
     OCR_AVAILABLE = False
-    logging.warning("OCR libraries not found (cv2, numpy, pytesseract). Text extraction from images will be disabled.")
 def extract_text_from_image(image_path: str) -> str:
-    """Extracts text from an image file using Tesseract OCR."""
-    if not OCR_AVAILABLE:
-        return "Error: OCR dependencies are not installed. Please run 'pip install opencv-python-headless pytesseract'."
-    try:
-        pytesseract.get_tesseract_version()
-    except Exception:
-        return "Error: Tesseract OCR is not installed or not in your PATH."
     try:
         image = cv2.imread(image_path)
-        if image is None:
-            return "Error: Could not read image file."
         gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-        text = pytesseract.image_to_string(gray)
-        return text.strip() or "No text found in image."
-    except Exception as e:
-        logging.error(f"OCR extraction failed: {e}")
-        return f"Error during OCR: {e}"
 def extract_text_from_file(file_path: str) -> str:
-    """Extracts text from a variety of file types."""
-    if not file_path:
-        return ""
     ext = os.path.splitext(file_path)[1].lower()
     try:
         if ext == ".pdf":
-            with open(file_path, "rb") as f:
-                reader = PyPDF2.PdfReader(f)
-                return "\n".join(page.extract_text() or "" for page in reader.pages)
         elif ext == ".docx":
-            doc = docx.Document(file_path)
-            return "\n".join(p.text for p in doc.paragraphs)
         elif ext in [".txt", ".md", ".csv", ".html", ".css", ".js", ".py"]:
-            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
-                return f.read()
         elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
             return extract_text_from_image(file_path)
-        else:
-            return f"Unsupported file type: {ext}"
-    except Exception as e:
-        logging.error(f"Error extracting text from {file_path}: {e}")
-        return f"Error extracting text: {e}"
 def extract_website_content(url: str) -> str:
-    """Scrapes and returns the primary HTML content of a given URL."""
     try:
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-        }
         response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
         response.raise_for_status()
         response.encoding = response.apparent_encoding
         soup = BeautifulSoup(response.text, 'html.parser')
-        # Make all resource links absolute
         for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
             for item in soup.find_all(tag):
-                if item.has_attr(attr):
-                    item[attr] = urljoin(url, item[attr])
-        title = soup.title.string if soup.title else "N/A"
-        # Return a prettified version of the body content for context
-        body_content = soup.body.prettify() if soup.body else str(soup)
-        # Truncate for prompt
-        if len(body_content) > 15000:
-             body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"
-        return f"<!-- Original URL: {url} -->\n<!-- Title: {title} -->\n{body_content}"
-    except requests.RequestException as e:
-        logging.error(f"Website extraction failed for {url}: {e}")
-        return f"Error: Could not fetch content from the URL. Details: {e}"
-    except Exception as e:
-        logging.error(f"An unexpected error occurred during website extraction: {e}")
-        return f"Error: An unexpected error occurred. Details: {e}"

 # /extractor.py
+""" Handles content extraction from various sources like files, images, and websites. """
+import mimetypes, os, re, logging
+from urllib.parse import urljoin
+import PyPDF2, docx, requests
 from bs4 import BeautifulSoup
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 try:
+    import cv2, pytesseract
     OCR_AVAILABLE = True
 except ImportError:
     OCR_AVAILABLE = False
+    logging.warning("OCR libraries not found. Text extraction from images will be disabled.")
 def extract_text_from_image(image_path: str) -> str:
+    if not OCR_AVAILABLE: return "Error: OCR dependencies not installed."
     try:
         image = cv2.imread(image_path)
         gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        return pytesseract.image_to_string(gray) or "No text found in image."
+    except Exception as e: return f"Error during OCR: {e}"
 def extract_text_from_file(file_path: str) -> str:
+    if not file_path: return ""
     ext = os.path.splitext(file_path)[1].lower()
     try:
         if ext == ".pdf":
+            with open(file_path, "rb") as f: return "\n".join(p.extract_text() or "" for p in PyPDF2.PdfReader(f).pages)
         elif ext == ".docx":
+            return "\n".join(p.text for p in docx.Document(file_path).paragraphs)
         elif ext in [".txt", ".md", ".csv", ".html", ".css", ".js", ".py"]:
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as f: return f.read()
         elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
             return extract_text_from_image(file_path)
+        else: return f"Unsupported file type: {ext}"
+    except Exception as e: return f"Error extracting text: {e}"
 def extract_website_content(url: str) -> str:
     try:
+        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
         response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
         response.raise_for_status()
         response.encoding = response.apparent_encoding
         soup = BeautifulSoup(response.text, 'html.parser')
         for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
             for item in soup.find_all(tag):
+                if item.has_attr(attr): item[attr] = urljoin(url, item[attr])
+        body_content = str(soup)
+        if len(body_content) > 15000: body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"
+        return f"<!-- Original URL: {url} -->\n{body_content}"
+    except Exception as e: return f"Error: Could not fetch content from {url}. Details: {e}"