Yaz Hobooti
committed on
Commit · e7a28e8
0 Parent(s):
Increase PDF resolution: DPI from 300 to 600, scaling factors improved for better OCR and barcode detection
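For context, the change described above centres on the page-rendering step that feeds OCR and barcode detection. A minimal, hedged sketch of what rendering at the higher resolution looks like with pdf2image's `convert_from_path` (the same call used in `pdf_comparator.py` below); `sample.pdf` is a placeholder path, not a file from this repository:

```python
from pdf2image import convert_from_path

# Render each PDF page as a PIL image. Raising dpi from 300 to 600 roughly
# doubles the pixel dimensions per axis, which helps Tesseract resolve tiny
# fonts and pyzbar resolve small barcodes, at the cost of memory and time.
pages_300 = convert_from_path("sample.pdf", dpi=300)  # previous setting
pages_600 = convert_from_path("sample.pdf", dpi=600)  # new setting

print(pages_300[0].size, pages_600[0].size)  # 600 DPI render is ~2x larger per axis
```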
- ProofCheck/.dockerignore +65 -0
- ProofCheck/.gitattributes +35 -0
- ProofCheck/Dockerfile +46 -0
- ProofCheck/README.md +117 -0
- ProofCheck/app.py +99 -0
- ProofCheck/pdf_comparator.py +1938 -0
- ProofCheck/requirements.txt +20 -0
- ProofCheck/run.py +123 -0
- ProofCheck/static/css/style.css +324 -0
- ProofCheck/static/js/script.js +353 -0
- ProofCheck/templates/index.html +154 -0
- ProofCheck/test_setup.py +133 -0
- README.md +203 -0
- app.py +97 -0
- pdf_comparator.py +551 -0
- requirements.txt +16 -0
- run.py +123 -0
- static/css/style.css +228 -0
- static/js/script.js +242 -0
- templates/index.html +142 -0
- test_setup.py +133 -0
ProofCheck/.dockerignore
ADDED
@@ -0,0 +1,65 @@
+# Git
+.git
+.gitignore
+.gitattributes
+
+# Python
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env
+pip-log.txt
+pip-delete-this-directory.txt
+.tox
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.git
+.mypy_cache
+.pytest_cache
+.hypothesis
+
+# Virtual environments
+venv/
+ENV/
+env/
+.venv/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Temporary files
+*.tmp
+*.temp
+uploads/
+results/
+static/results/
+
+# Documentation
+README.md
+*.md
+docs/
+
+# Test files
+test_*.py
+*_test.py
+tests/
ProofCheck/.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
ProofCheck/Dockerfile
ADDED
@@ -0,0 +1,46 @@
+FROM python:3.9-slim
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies including Tesseract OCR and zbar
+RUN apt-get update && apt-get install -y \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    tesseract-ocr-fra \
+    poppler-utils \
+    libzbar0 \
+    libgl1 \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender1 \
+    libgomp1 \
+    libgthread-2.0-0 \
+    libfontconfig1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first for better caching
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Download NLTK data
+RUN python -c "import nltk; nltk.download('punkt')"
+
+# Copy application files
+COPY . .
+
+# Create necessary directories
+RUN mkdir -p uploads results static/results
+
+# Expose port
+EXPOSE 7860
+
+# Set environment variables
+ENV PYTHONPATH=/app
+ENV FLASK_APP=app.py
+
+# Run the application
+CMD ["python", "app.py"]
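The Dockerfile above pulls in the native libraries that the Python stack depends on (Tesseract with English/French data, zbar, poppler). A quick smoke test one might run inside the built image to confirm those packages are usable from Python; this script is illustrative only and is not part of the commit:

```python
import pytesseract
from PIL import Image
from pyzbar.pyzbar import decode
from pdf2image import convert_from_path  # needs poppler-utils at runtime

print("tesseract:", pytesseract.get_tesseract_version())                    # tesseract-ocr
print("ocr on blank image:", repr(pytesseract.image_to_string(Image.new("L", (50, 50), 255))))
print("zbar on blank image:", decode(Image.new("L", (100, 100), 255)))      # libzbar0 via pyzbar
# convert_from_path("sample.pdf", dpi=600) would exercise poppler; the path is a placeholder
```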
ProofCheck/README.md
ADDED
@@ -0,0 +1,117 @@
+---
+title: PDF Comparison Tool
+emoji: π
+colorFrom: blue
+colorTo: purple
+sdk: docker
+pinned: false
+license: mit
+---
+
+# PDF Comparison Tool
+
+A comprehensive web-based tool for comparing PDF documents with advanced features including OCR validation, color difference detection, spelling verification, and barcode/QR code detection.
+
+## Live Demo
+
+This tool is deployed on Hugging Face Spaces and available for immediate use!
+
+## Features
+
+- **PDF Validation**: Ensures uploaded PDFs contain "50 Carroll" using OCR
+- **Color Difference Detection**: Identifies visual differences between PDFs and highlights them with red boxes
+- **Spelling Verification**: Checks text against both English and French dictionaries
+- **Barcode/QR Code Detection**: Automatically detects and reads barcodes and QR codes
+- **Visual Comparison**: Side-by-side comparison with annotated differences
+- **Modern Web Interface**: Responsive design with Bootstrap and custom styling
+
+## Requirements
+
+- Both PDF files must contain the text "50 Carroll" for validation
+- Maximum file size: 16MB per PDF
+- Supported format: PDF only
+
+## How to Use
+
+1. **Upload PDFs**: Select two PDF files for comparison
+2. **Validation**: The tool automatically checks for "50 Carroll" in both documents
+3. **Processing**: Wait for the analysis to complete (may take a few minutes)
+4. **Results**: View findings in three organized tabs:
+   - **Visual Comparison**: Side-by-side view with red boxes highlighting differences
+   - **Spelling Issues**: Table of spelling errors with suggestions from English and French dictionaries
+   - **Barcodes & QR Codes**: List of detected barcodes with their data and positions
+
+## Technical Details
+
+### Backend Technologies
+- **Python Flask**: Web framework
+- **OpenCV**: Image processing and comparison
+- **Tesseract OCR**: Text extraction from PDFs
+- **scikit-image**: Structural similarity analysis
+- **pyspellchecker**: Spelling verification
+- **pyzbar**: Barcode and QR code detection
+
+### Frontend Technologies
+- **HTML5/CSS3**: Modern responsive design
+- **JavaScript**: Dynamic content and AJAX requests
+- **Bootstrap**: UI framework for professional appearance
+
+### Comparison Algorithms
+- **Color Difference**: Uses Structural Similarity Index (SSIM) for pixel-level comparison
+- **Text Analysis**: OCR-based text extraction with multi-language spell checking
+- **Barcode Detection**: Automatic recognition of various barcode and QR code formats
+
+## Local Development
+
+If you want to run this tool locally:
+
+```bash
+# Clone the repository
+git clone https://huggingface.co/spaces/Digitaljoint/ProofCheck
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Install Tesseract OCR
+# macOS: brew install tesseract
+# Ubuntu: sudo apt-get install tesseract-ocr
+
+# Run the application
+python app.py
+```
+
+## Output Examples
+
+### Visual Comparison
+- Red rectangles highlight color differences between PDFs
+- Side-by-side view for easy comparison
+- Page-by-page analysis
+
+### Spelling Issues
+- Word-by-word analysis against English and French dictionaries
+- Spelling suggestions for both languages
+- Organized table format with original text and corrections
+
+### Barcode/QR Code Detection
+- Automatic detection of various barcode formats
+- Extracted data display
+- Position information for each detected code
+
+## Privacy & Security
+
+- All processing happens locally on the server
+- No data is stored permanently
+- Files are automatically cleaned up after processing
+- No external API calls or data sharing
+
+## Contributing
+
+This tool is open source and contributions are welcome! Please feel free to submit issues or pull requests.
+
+## License
+
+This project is available under the MIT License.
+
+---
+
+**Note**: This tool is specifically designed to validate PDFs containing "50 Carroll" and will reject files that don't contain this text. This ensures that only relevant documents are processed for comparison.
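The README's "Comparison Algorithms" section names SSIM for the visual/color diff with red boxes. The committed implementation lives in `pdf_comparator.py`; the following is only a rough sketch of that general SSIM-plus-bounding-box approach (function name, threshold, and box thickness are illustrative assumptions, not the project's exact code):

```python
import cv2
import numpy as np
from skimage.metrics import structural_similarity as ssim

def highlight_differences(img_a: np.ndarray, img_b: np.ndarray, thresh: int = 50):
    """Return (score, img_a annotated with red boxes where it differs from img_b).
    Expects two BGR images of the same size."""
    gray_a = cv2.cvtColor(img_a, cv2.COLOR_BGR2GRAY)
    gray_b = cv2.cvtColor(img_b, cv2.COLOR_BGR2GRAY)

    # full=True returns a per-pixel similarity map alongside the global score
    score, diff = ssim(gray_a, gray_b, full=True)
    diff = ((1.0 - diff) * 255).astype("uint8")  # high values = large difference

    # Threshold the difference map and box each connected region
    _, mask = cv2.threshold(diff, thresh, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    annotated = img_a.copy()
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 0, 255), 3)  # red in BGR
    return score, annotated
```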
ProofCheck/app.py
ADDED
@@ -0,0 +1,99 @@
+import os
+import uuid
+import json
+from flask import Flask, request, render_template, jsonify, send_file
+from werkzeug.utils import secure_filename
+from pdf_comparator import PDFComparator
+import tempfile
+import shutil
+
+app = Flask(__name__)
+app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size
+app.config['UPLOAD_FOLDER'] = 'uploads'
+app.config['RESULTS_FOLDER'] = 'results'
+
+# Ensure directories exist
+os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+os.makedirs(app.config['RESULTS_FOLDER'], exist_ok=True)
+os.makedirs('static/results', exist_ok=True)
+
+ALLOWED_EXTENSIONS = {'pdf'}
+
+def allowed_file(filename):
+    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
+@app.route('/')
+def index():
+    return render_template('index.html')
+
+@app.route('/upload', methods=['POST'])
+def upload_files():
+    if 'pdf1' not in request.files or 'pdf2' not in request.files:
+        return jsonify({'error': 'Both PDF files are required'}), 400
+
+    pdf1 = request.files['pdf1']
+    pdf2 = request.files['pdf2']
+
+    if pdf1.filename == '' or pdf2.filename == '':
+        return jsonify({'error': 'Both PDF files are required'}), 400
+
+    if not (allowed_file(pdf1.filename) and allowed_file(pdf2.filename)):
+        return jsonify({'error': 'Only PDF files are allowed'}), 400
+
+    # Create unique session directory
+    session_id = str(uuid.uuid4())
+    session_dir = os.path.join(app.config['UPLOAD_FOLDER'], session_id)
+    os.makedirs(session_dir, exist_ok=True)
+
+    # Save uploaded files
+    pdf1_path = os.path.join(session_dir, secure_filename(pdf1.filename))
+    pdf2_path = os.path.join(session_dir, secure_filename(pdf2.filename))
+
+    pdf1.save(pdf1_path)
+    pdf2.save(pdf2_path)
+
+    try:
+        # Initialize PDF comparator
+        comparator = PDFComparator()
+
+        # Perform comparison
+        results = comparator.compare_pdfs(pdf1_path, pdf2_path, session_id)
+
+        # Save results
+        results_path = os.path.join(app.config['RESULTS_FOLDER'], f'{session_id}_results.json')
+        with open(results_path, 'w') as f:
+            json.dump(results, f, indent=2)
+
+        return jsonify({
+            'success': True,
+            'session_id': session_id,
+            'results': results
+        })
+
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+@app.route('/results/<session_id>')
+def get_results(session_id):
+    results_path = os.path.join(app.config['RESULTS_FOLDER'], f'{session_id}_results.json')
+
+    if not os.path.exists(results_path):
+        return jsonify({'error': 'Results not found'}), 404
+
+    with open(results_path, 'r') as f:
+        results = json.load(f)
+
+    return jsonify(results)
+
+@app.route('/download/<session_id>/<filename>')
+def download_file(session_id, filename):
+    file_path = os.path.join(app.config['UPLOAD_FOLDER'], session_id, filename)
+
+    if not os.path.exists(file_path):
+        return jsonify({'error': 'File not found'}), 404
+
+    return send_file(file_path, as_attachment=True)
+
+# For Hugging Face Spaces deployment
+if __name__ == '__main__':
+    app.run(debug=True, host='0.0.0.0', port=7860)
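For reference, the `/upload` route above accepts two multipart file fields named `pdf1` and `pdf2` and returns JSON. A hedged client-side example, assuming the app is running locally on port 7860 and that two suitable PDFs exist at the placeholder paths:

```python
import requests

URL = "http://localhost:7860"

# Placeholder paths; both PDFs must pass the "50 Carroll" validation described in the README
with open("proof_a.pdf", "rb") as a, open("proof_b.pdf", "rb") as b:
    resp = requests.post(f"{URL}/upload", files={"pdf1": a, "pdf2": b})

resp.raise_for_status()
payload = resp.json()
print("session:", payload["session_id"])

# Results can be re-fetched later by session id via the /results route
print(requests.get(f"{URL}/results/{payload['session_id']}").json())
```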
ProofCheck/pdf_comparator.py
ADDED
@@ -0,0 +1,1938 @@
| 1 |
+
import os
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 5 |
+
import pytesseract
|
| 6 |
+
from pdf2image import convert_from_path
|
| 7 |
+
from pyzbar.pyzbar import decode
|
| 8 |
+
from spellchecker import SpellChecker
|
| 9 |
+
import nltk
|
| 10 |
+
from skimage.metrics import structural_similarity as ssim
|
| 11 |
+
from skimage import color
|
| 12 |
+
import json
|
| 13 |
+
import tempfile
|
| 14 |
+
import shutil
|
| 15 |
+
import re
|
| 16 |
+
import time
|
| 17 |
+
import signal
|
| 18 |
+
import unicodedata
|
| 19 |
+
|
| 20 |
+
# Safe import for regex with fallback
|
| 21 |
+
try:
|
| 22 |
+
import regex as _re
|
| 23 |
+
_USE_REGEX = True
|
| 24 |
+
except ImportError:
|
| 25 |
+
import re as _re
|
| 26 |
+
_USE_REGEX = False
|
| 27 |
+
|
| 28 |
+
TOKEN_PATTERN = r"(?:\p{L})(?:[\p{L}'-]{1,})" if _USE_REGEX else r"[A-Za-z][A-Za-z'-]{1,}"
|
| 29 |
+
|
| 30 |
+
# Domain whitelist for spell checking
|
| 31 |
+
DOMAIN_WHITELIST = {
|
| 32 |
+
# units / abbreviations
|
| 33 |
+
"mg", "mg/g", "ml", "g", "thc", "cbd", "tcm", "mct",
|
| 34 |
+
# common packaging terms / bilingual words you expect
|
| 35 |
+
"gouttes", "tennir", "net", "zoom", "tytann", "dome", "drops",
|
| 36 |
+
# brand or proper names you want to ignore completely
|
| 37 |
+
"purified", "brands", "tytann", "dome", "drops",
|
| 38 |
+
}
|
| 39 |
+
# lowercase everything in whitelist for comparisons
|
| 40 |
+
DOMAIN_WHITELIST = {w.lower() for w in DOMAIN_WHITELIST}
|
| 41 |
+
|
| 42 |
+
def _likely_french(token: str) -> bool:
|
| 43 |
+
"""Helper: quick language guess per token"""
|
| 44 |
+
if _USE_REGEX:
|
| 45 |
+
# any Latin letter outside ASCII => probably FR (Γ©, Γ¨, Γ§β¦)
|
| 46 |
+
return bool(_re.search(r"[\p{Letter}&&\p{Latin}&&[^A-Za-z]]", token))
|
| 47 |
+
# fallback: any non-ascii letter
|
| 48 |
+
return any((not ('a' <= c.lower() <= 'z')) and c.isalpha() for c in token)
|
| 49 |
+
|
| 50 |
+
# Try to import additional barcode libraries
|
| 51 |
+
try:
|
| 52 |
+
import zxing
|
| 53 |
+
ZXING_AVAILABLE = True
|
| 54 |
+
except ImportError:
|
| 55 |
+
ZXING_AVAILABLE = False
|
| 56 |
+
print("zxing-cpp not available, using pyzbar only")
|
| 57 |
+
|
| 58 |
+
try:
|
| 59 |
+
from dbr import BarcodeReader
|
| 60 |
+
DBR_AVAILABLE = True
|
| 61 |
+
print("Dynamsoft Barcode Reader available")
|
| 62 |
+
except ImportError:
|
| 63 |
+
DBR_AVAILABLE = False
|
| 64 |
+
print("Dynamsoft Barcode Reader not available")
|
| 65 |
+
|
| 66 |
+
class TimeoutError(Exception):
|
| 67 |
+
pass
|
| 68 |
+
|
| 69 |
+
def timeout_handler(signum, frame):
|
| 70 |
+
raise TimeoutError("Operation timed out")
|
| 71 |
+
|
| 72 |
+
class PDFComparator:
|
| 73 |
+
def __init__(self):
|
| 74 |
+
# Initialize spell checkers for English and French
|
| 75 |
+
self.english_spellchecker = SpellChecker(language='en')
|
| 76 |
+
self.french_spellchecker = SpellChecker(language='fr')
|
| 77 |
+
|
| 78 |
+
# Add domain whitelist words to spell checkers
|
| 79 |
+
for w in DOMAIN_WHITELIST:
|
| 80 |
+
self.english_spellchecker.word_frequency.add(w)
|
| 81 |
+
self.french_spellchecker.word_frequency.add(w)
|
| 82 |
+
|
| 83 |
+
# Download required NLTK data
|
| 84 |
+
try:
|
| 85 |
+
nltk.data.find('tokenizers/punkt')
|
| 86 |
+
except LookupError:
|
| 87 |
+
nltk.download('punkt')
|
| 88 |
+
|
| 89 |
+
def safe_execute(self, func, *args, timeout=30, **kwargs):
|
| 90 |
+
"""Execute a function with timeout protection"""
|
| 91 |
+
try:
|
| 92 |
+
# Set timeout signal
|
| 93 |
+
signal.signal(signal.SIGALRM, timeout_handler)
|
| 94 |
+
signal.alarm(timeout)
|
| 95 |
+
|
| 96 |
+
# Execute function
|
| 97 |
+
result = func(*args, **kwargs)
|
| 98 |
+
|
| 99 |
+
# Cancel timeout
|
| 100 |
+
signal.alarm(0)
|
| 101 |
+
return result
|
| 102 |
+
|
| 103 |
+
except TimeoutError:
|
| 104 |
+
print(f"Function {func.__name__} timed out after {timeout} seconds")
|
| 105 |
+
return None
|
| 106 |
+
except Exception as e:
|
| 107 |
+
print(f"Error in {func.__name__}: {str(e)}")
|
| 108 |
+
return None
|
| 109 |
+
finally:
|
| 110 |
+
signal.alarm(0)
|
| 111 |
+
|
| 112 |
+
def validate_pdf(self, pdf_path):
|
| 113 |
+
"""Validate that PDF contains '50 Carroll' using enhanced OCR for tiny fonts"""
|
| 114 |
+
try:
|
| 115 |
+
print(f"Validating PDF: {pdf_path}")
|
| 116 |
+
|
| 117 |
+
# Try multiple DPI settings for better tiny font detection
|
| 118 |
+
dpi_settings = [300, 400, 600, 800]
|
| 119 |
+
|
| 120 |
+
for dpi in dpi_settings:
|
| 121 |
+
print(f"Trying DPI {dpi} for tiny font detection...")
|
| 122 |
+
|
| 123 |
+
# Convert PDF to images with current DPI
|
| 124 |
+
images = convert_from_path(pdf_path, dpi=dpi)
|
| 125 |
+
print(f"Converted PDF to {len(images)} images at {dpi} DPI")
|
| 126 |
+
|
| 127 |
+
for page_num, image in enumerate(images):
|
| 128 |
+
print(f"Processing page {page_num + 1} at {dpi} DPI...")
|
| 129 |
+
|
| 130 |
+
# Convert PIL image to OpenCV format
|
| 131 |
+
opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 132 |
+
|
| 133 |
+
# Enhanced preprocessing for tiny fonts
|
| 134 |
+
processed_image = self.enhance_image_for_tiny_fonts(opencv_image)
|
| 135 |
+
|
| 136 |
+
# Try multiple OCR configurations
|
| 137 |
+
ocr_configs = [
|
| 138 |
+
'--oem 3 --psm 6', # Assume uniform block of text
|
| 139 |
+
'--oem 3 --psm 8', # Single word
|
| 140 |
+
'--oem 3 --psm 13', # Raw line
|
| 141 |
+
'--oem 1 --psm 6', # Legacy engine
|
| 142 |
+
'--oem 3 --psm 3', # Fully automatic page segmentation
|
| 143 |
+
]
|
| 144 |
+
|
| 145 |
+
for config in ocr_configs:
|
| 146 |
+
try:
|
| 147 |
+
# Perform OCR with current configuration
|
| 148 |
+
text = pytesseract.image_to_string(processed_image, config=config)
|
| 149 |
+
|
| 150 |
+
# Debug: Show first 300 characters of extracted text
|
| 151 |
+
debug_text = text[:300].replace('\n', ' ').replace('\r', ' ')
|
| 152 |
+
print(f"Page {page_num + 1} text (DPI {dpi}, config: {config}): '{debug_text}...'")
|
| 153 |
+
|
| 154 |
+
# Check for "50 Carroll" with various patterns
|
| 155 |
+
patterns = ["50 Carroll", "50 carroll", "50Carroll", "50carroll", "50 Carroll", "50 carroll"]
|
| 156 |
+
for pattern in patterns:
|
| 157 |
+
if pattern in text or pattern.lower() in text.lower():
|
| 158 |
+
print(f"Found '{pattern}' in page {page_num + 1} (DPI {dpi}, config: {config})")
|
| 159 |
+
return True
|
| 160 |
+
|
| 161 |
+
except Exception as ocr_error:
|
| 162 |
+
print(f"OCR error with config {config}: {str(ocr_error)}")
|
| 163 |
+
continue
|
| 164 |
+
|
| 165 |
+
print("Validation failed: '50 Carroll' not found in any page with any DPI or OCR config")
|
| 166 |
+
return False
|
| 167 |
+
|
| 168 |
+
except Exception as e:
|
| 169 |
+
print(f"Error validating PDF: {str(e)}")
|
| 170 |
+
raise Exception(f"Error validating PDF: {str(e)}")
|
| 171 |
+
|
| 172 |
+
def enhance_image_for_tiny_fonts(self, image):
|
| 173 |
+
"""Enhance image specifically for tiny font OCR"""
|
| 174 |
+
try:
|
| 175 |
+
# Convert to grayscale
|
| 176 |
+
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
| 177 |
+
|
| 178 |
+
# Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
|
| 179 |
+
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
|
| 180 |
+
enhanced = clahe.apply(gray)
|
| 181 |
+
|
| 182 |
+
# Apply bilateral filter to reduce noise while preserving edges
|
| 183 |
+
denoised = cv2.bilateralFilter(enhanced, 9, 75, 75)
|
| 184 |
+
|
| 185 |
+
# Apply unsharp masking to enhance edges
|
| 186 |
+
gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
|
| 187 |
+
unsharp_mask = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
|
| 188 |
+
|
| 189 |
+
# Apply adaptive thresholding
|
| 190 |
+
thresh = cv2.adaptiveThreshold(unsharp_mask, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
|
| 191 |
+
|
| 192 |
+
# Apply morphological operations to clean up
|
| 193 |
+
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
|
| 194 |
+
cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
|
| 195 |
+
|
| 196 |
+
return cleaned
|
| 197 |
+
|
| 198 |
+
except Exception as e:
|
| 199 |
+
print(f"Error enhancing image for tiny fonts: {str(e)}")
|
| 200 |
+
return image
|
| 201 |
+
|
| 202 |
+
def extract_text_from_pdf(self, pdf_path):
|
| 203 |
+
"""Extract text from PDF with multi-color text detection."""
|
| 204 |
+
try:
|
| 205 |
+
# Try to extract embedded text first
|
| 206 |
+
embedded_text = ""
|
| 207 |
+
try:
|
| 208 |
+
import fitz # PyMuPDF
|
| 209 |
+
doc = fitz.open(pdf_path)
|
| 210 |
+
all_text = []
|
| 211 |
+
any_text = False
|
| 212 |
+
for i, page in enumerate(doc):
|
| 213 |
+
t = page.get_text()
|
| 214 |
+
any_text |= bool(t.strip())
|
| 215 |
+
all_text.append({"page": i+1, "text": t, "image": None})
|
| 216 |
+
doc.close()
|
| 217 |
+
if any_text:
|
| 218 |
+
# render images for color diff/barcode only when needed
|
| 219 |
+
images = convert_from_path(pdf_path, dpi=600)
|
| 220 |
+
for d, im in zip(all_text, images):
|
| 221 |
+
d["image"] = im
|
| 222 |
+
return all_text
|
| 223 |
+
except Exception:
|
| 224 |
+
pass
|
| 225 |
+
|
| 226 |
+
# Enhanced OCR path with multi-color text detection
|
| 227 |
+
print("Extracting text with multi-color detection...")
|
| 228 |
+
images = convert_from_path(pdf_path, dpi=600)
|
| 229 |
+
all_text = []
|
| 230 |
+
|
| 231 |
+
for page_num, image in enumerate(images):
|
| 232 |
+
opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 233 |
+
|
| 234 |
+
# Multi-color text extraction
|
| 235 |
+
combined_text = self.extract_multi_color_text(opencv_image)
|
| 236 |
+
|
| 237 |
+
all_text.append({
|
| 238 |
+
'page': page_num + 1,
|
| 239 |
+
'text': combined_text,
|
| 240 |
+
'image': image
|
| 241 |
+
})
|
| 242 |
+
|
| 243 |
+
return all_text
|
| 244 |
+
|
| 245 |
+
except Exception as e:
|
| 246 |
+
raise Exception(f"Error extracting text from PDF: {str(e)}")
|
| 247 |
+
|
| 248 |
+
def extract_multi_color_text(self, image):
|
| 249 |
+
"""Extract text from image in various colors using multiple preprocessing methods."""
|
| 250 |
+
try:
|
| 251 |
+
combined_text = ""
|
| 252 |
+
|
| 253 |
+
# Method 1: Standard black text detection
|
| 254 |
+
print("Method 1: Standard black text detection")
|
| 255 |
+
processed_image = self.enhance_image_for_tiny_fonts(image)
|
| 256 |
+
text1 = self.ocr_with_multiple_configs(processed_image)
|
| 257 |
+
combined_text += text1 + " "
|
| 258 |
+
|
| 259 |
+
# Method 2: Inverted text detection (for white text on dark background)
|
| 260 |
+
print("Method 2: Inverted text detection")
|
| 261 |
+
inverted_image = self.create_inverted_image(image)
|
| 262 |
+
text2 = self.ocr_with_multiple_configs(inverted_image)
|
| 263 |
+
combined_text += text2 + " "
|
| 264 |
+
|
| 265 |
+
# Method 3: Color channel separation for colored text
|
| 266 |
+
print("Method 3: Color channel separation")
|
| 267 |
+
for channel_name, channel_image in self.extract_color_channels(image):
|
| 268 |
+
text3 = self.ocr_with_multiple_configs(channel_image)
|
| 269 |
+
combined_text += text3 + " "
|
| 270 |
+
|
| 271 |
+
# Method 4: Edge-based text detection
|
| 272 |
+
print("Method 4: Edge-based text detection")
|
| 273 |
+
edge_image = self.create_edge_enhanced_image(image)
|
| 274 |
+
text4 = self.ocr_with_multiple_configs(edge_image)
|
| 275 |
+
combined_text += text4 + " "
|
| 276 |
+
|
| 277 |
+
return combined_text.strip()
|
| 278 |
+
|
| 279 |
+
except Exception as e:
|
| 280 |
+
print(f"Error in multi-color text extraction: {str(e)}")
|
| 281 |
+
return ""
|
| 282 |
+
|
| 283 |
+
def create_inverted_image(self, image):
|
| 284 |
+
"""Create inverted image for white text detection."""
|
| 285 |
+
try:
|
| 286 |
+
# Convert to grayscale
|
| 287 |
+
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
| 288 |
+
|
| 289 |
+
# Invert the image
|
| 290 |
+
inverted = cv2.bitwise_not(gray)
|
| 291 |
+
|
| 292 |
+
# Apply CLAHE for better contrast
|
| 293 |
+
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
|
| 294 |
+
enhanced = clahe.apply(inverted)
|
| 295 |
+
|
| 296 |
+
# Apply thresholding
|
| 297 |
+
_, thresh = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
| 298 |
+
|
| 299 |
+
return thresh
|
| 300 |
+
|
| 301 |
+
except Exception as e:
|
| 302 |
+
print(f"Error creating inverted image: {str(e)}")
|
| 303 |
+
return image
|
| 304 |
+
|
| 305 |
+
def extract_color_channels(self, image):
|
| 306 |
+
"""Extract individual color channels for colored text detection."""
|
| 307 |
+
try:
|
| 308 |
+
channels = []
|
| 309 |
+
|
| 310 |
+
# Convert to different color spaces
|
| 311 |
+
hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
|
| 312 |
+
lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
|
| 313 |
+
|
| 314 |
+
# Extract individual channels
|
| 315 |
+
b, g, r = cv2.split(image)
|
| 316 |
+
h, s, v = cv2.split(hsv)
|
| 317 |
+
l, a, b_lab = cv2.split(lab)
|
| 318 |
+
|
| 319 |
+
# Create channel images for OCR
|
| 320 |
+
channel_images = [
|
| 321 |
+
("blue", b),
|
| 322 |
+
("green", g),
|
| 323 |
+
("red", r),
|
| 324 |
+
("hue", h),
|
| 325 |
+
("saturation", s),
|
| 326 |
+
("value", v),
|
| 327 |
+
("lightness", l)
|
| 328 |
+
]
|
| 329 |
+
|
| 330 |
+
for name, channel in channel_images:
|
| 331 |
+
# Apply thresholding to each channel
|
| 332 |
+
_, thresh = cv2.threshold(channel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
| 333 |
+
channels.append((name, thresh))
|
| 334 |
+
|
| 335 |
+
return channels
|
| 336 |
+
|
| 337 |
+
except Exception as e:
|
| 338 |
+
print(f"Error extracting color channels: {str(e)}")
|
| 339 |
+
return []
|
| 340 |
+
|
| 341 |
+
def create_edge_enhanced_image(self, image):
|
| 342 |
+
"""Create edge-enhanced image for text detection."""
|
| 343 |
+
try:
|
| 344 |
+
# Convert to grayscale
|
| 345 |
+
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
| 346 |
+
|
| 347 |
+
# Apply edge detection
|
| 348 |
+
edges = cv2.Canny(gray, 50, 150)
|
| 349 |
+
|
| 350 |
+
# Dilate edges to connect text components
|
| 351 |
+
kernel = np.ones((2, 2), np.uint8)
|
| 352 |
+
dilated = cv2.dilate(edges, kernel, iterations=1)
|
| 353 |
+
|
| 354 |
+
# Invert to get white text on black background
|
| 355 |
+
inverted = cv2.bitwise_not(dilated)
|
| 356 |
+
|
| 357 |
+
return inverted
|
| 358 |
+
|
| 359 |
+
except Exception as e:
|
| 360 |
+
print(f"Error creating edge-enhanced image: {str(e)}")
|
| 361 |
+
return image
|
| 362 |
+
|
| 363 |
+
def ocr_with_multiple_configs(self, image):
|
| 364 |
+
"""Perform OCR with multiple configurations."""
|
| 365 |
+
try:
|
| 366 |
+
ocr_configs = [
|
| 367 |
+
'--oem 3 --psm 6', # Assume uniform block of text
|
| 368 |
+
'--oem 3 --psm 8', # Single word
|
| 369 |
+
'--oem 3 --psm 13', # Raw line
|
| 370 |
+
'--oem 1 --psm 6', # Legacy engine
|
| 371 |
+
]
|
| 372 |
+
|
| 373 |
+
best_text = ""
|
| 374 |
+
for config in ocr_configs:
|
| 375 |
+
try:
|
| 376 |
+
text = pytesseract.image_to_string(image, config=config)
|
| 377 |
+
if len(text.strip()) > len(best_text.strip()):
|
| 378 |
+
best_text = text
|
| 379 |
+
except Exception as ocr_error:
|
| 380 |
+
print(f"OCR error with config {config}: {str(ocr_error)}")
|
| 381 |
+
continue
|
| 382 |
+
|
| 383 |
+
return best_text
|
| 384 |
+
|
| 385 |
+
except Exception as e:
|
| 386 |
+
print(f"Error in OCR with multiple configs: {str(e)}")
|
| 387 |
+
return ""
|
| 388 |
+
|
| 389 |
+
def annotate_spelling_errors_on_image(self, pil_image, misspelled):
|
| 390 |
+
"""
|
| 391 |
+
Draw one red rectangle around each misspelled token using Tesseract word boxes.
|
| 392 |
+
'misspelled' must be a list of dicts with 'word' keys (from check_spelling).
|
| 393 |
+
"""
|
| 394 |
+
if not misspelled:
|
| 395 |
+
return pil_image
|
| 396 |
+
|
| 397 |
+
def _norm(s: str) -> str:
|
| 398 |
+
return unicodedata.normalize("NFKC", s).replace("'","'").strip(".,:;!?)(").lower()
|
| 399 |
+
|
| 400 |
+
# build a quick lookup of misspelled lowercase words
|
| 401 |
+
miss_set = {_norm(m["word"]) for m in misspelled}
|
| 402 |
+
|
| 403 |
+
# run word-level OCR to get boxes
|
| 404 |
+
img = pil_image
|
| 405 |
+
try:
|
| 406 |
+
data = pytesseract.image_to_data(
|
| 407 |
+
img,
|
| 408 |
+
lang="eng+fra",
|
| 409 |
+
config="--oem 3 --psm 6",
|
| 410 |
+
output_type=pytesseract.Output.DICT,
|
| 411 |
+
)
|
| 412 |
+
except Exception as e:
|
| 413 |
+
print("image_to_data failed:", e)
|
| 414 |
+
return img
|
| 415 |
+
|
| 416 |
+
draw = ImageDraw.Draw(img)
|
| 417 |
+
n = len(data.get("text", []))
|
| 418 |
+
for i in range(n):
|
| 419 |
+
word = (data["text"][i] or "").strip()
|
| 420 |
+
if not word:
|
| 421 |
+
continue
|
| 422 |
+
clean = _norm(word)
|
| 423 |
+
|
| 424 |
+
if clean and clean in miss_set:
|
| 425 |
+
x, y, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
|
| 426 |
+
# draw a distinct box for this one word
|
| 427 |
+
draw.rectangle([x, y, x + w, y + h], outline="red", width=4)
|
| 428 |
+
|
| 429 |
+
return img
|
| 430 |
+
|
| 431 |
+
def detect_barcodes_qr_codes(self, image):
|
| 432 |
+
"""Detect and decode barcodes and QR codes with timeout protection"""
|
| 433 |
+
try:
|
| 434 |
+
print("Starting barcode detection...")
|
| 435 |
+
start_time = time.time()
|
| 436 |
+
|
| 437 |
+
# Convert PIL image to OpenCV format
|
| 438 |
+
opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 439 |
+
|
| 440 |
+
all_barcodes = []
|
| 441 |
+
|
| 442 |
+
# Method 1: Basic pyzbar detection (fastest)
|
| 443 |
+
print("Method 1: Basic pyzbar detection")
|
| 444 |
+
pyzbar_results = self.detect_with_pyzbar_basic(opencv_image)
|
| 445 |
+
if pyzbar_results:
|
| 446 |
+
all_barcodes.extend(pyzbar_results)
|
| 447 |
+
print(f"Found {len(pyzbar_results)} barcodes with basic pyzbar")
|
| 448 |
+
|
| 449 |
+
# Method 2: Dynamsoft Barcode Reader (if available)
|
| 450 |
+
if DBR_AVAILABLE:
|
| 451 |
+
print("Method 2: Dynamsoft Barcode Reader")
|
| 452 |
+
dbr_results = self.detect_with_dynamsoft(opencv_image)
|
| 453 |
+
if dbr_results:
|
| 454 |
+
all_barcodes.extend(dbr_results)
|
| 455 |
+
print(f"Found {len(dbr_results)} barcodes with Dynamsoft")
|
| 456 |
+
|
| 457 |
+
# Method 3: Enhanced preprocessing (always run for better detection)
|
| 458 |
+
print("Method 3: Enhanced preprocessing")
|
| 459 |
+
enhanced_results = self.detect_with_enhanced_preprocessing(opencv_image)
|
| 460 |
+
if enhanced_results:
|
| 461 |
+
all_barcodes.extend(enhanced_results)
|
| 462 |
+
print(f"Found {len(enhanced_results)} additional barcodes with enhanced preprocessing")
|
| 463 |
+
|
| 464 |
+
# Method 4: Small barcode detection (always run for better detection)
|
| 465 |
+
print("Method 4: Small barcode detection")
|
| 466 |
+
small_results = self.detect_small_barcodes_simple(opencv_image)
|
| 467 |
+
if small_results:
|
| 468 |
+
all_barcodes.extend(small_results)
|
| 469 |
+
print(f"Found {len(small_results)} additional small barcodes")
|
| 470 |
+
|
| 471 |
+
# Remove duplicates
|
| 472 |
+
unique_barcodes = self.remove_duplicate_barcodes(all_barcodes)
|
| 473 |
+
|
| 474 |
+
# Enhance results
|
| 475 |
+
enhanced_barcodes = self.enhance_barcode_data(unique_barcodes)
|
| 476 |
+
|
| 477 |
+
elapsed_time = time.time() - start_time
|
| 478 |
+
print(f"Barcode detection completed in {elapsed_time:.2f} seconds. Found {len(enhanced_barcodes)} unique barcodes.")
|
| 479 |
+
|
| 480 |
+
return enhanced_barcodes
|
| 481 |
+
|
| 482 |
+
except Exception as e:
|
| 483 |
+
print(f"Error in barcode detection: {str(e)}")
|
| 484 |
+
return []
|
| 485 |
+
|
| 486 |
+
def detect_with_pyzbar_basic(self, image):
|
| 487 |
+
"""Basic pyzbar detection without complex preprocessing"""
|
| 488 |
+
results = []
|
| 489 |
+
|
| 490 |
+
try:
|
| 491 |
+
# Simple grayscale conversion
|
| 492 |
+
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
| 493 |
+
|
| 494 |
+
# Try original image
|
| 495 |
+
decoded_objects = decode(gray)
|
| 496 |
+
for obj in decoded_objects:
|
| 497 |
+
barcode_info = {
|
| 498 |
+
'type': obj.type,
|
| 499 |
+
'data': obj.data.decode('utf-8', errors='ignore'),
|
| 500 |
+
'rect': obj.rect,
|
| 501 |
+
'polygon': obj.polygon,
|
| 502 |
+
'quality': getattr(obj, 'quality', 0),
|
| 503 |
+
'orientation': self.detect_barcode_orientation(obj),
|
| 504 |
+
'method': 'pyzbar_basic'
|
| 505 |
+
}
|
| 506 |
+
|
| 507 |
+
if 'databar' in obj.type.lower():
|
| 508 |
+
barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))
|
| 509 |
+
|
| 510 |
+
results.append(barcode_info)
|
| 511 |
+
|
| 512 |
+
# Try with simple contrast enhancement
|
| 513 |
+
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
| 514 |
+
enhanced = clahe.apply(gray)
|
| 515 |
+
decoded_objects = decode(enhanced)
|
| 516 |
+
|
| 517 |
+
for obj in decoded_objects:
|
| 518 |
+
barcode_info = {
|
| 519 |
+
'type': obj.type,
|
| 520 |
+
'data': obj.data.decode('utf-8', errors='ignore'),
|
| 521 |
+
'rect': obj.rect,
|
| 522 |
+
'polygon': obj.polygon,
|
| 523 |
+
'quality': getattr(obj, 'quality', 0),
|
| 524 |
+
'orientation': self.detect_barcode_orientation(obj),
|
| 525 |
+
'method': 'pyzbar_enhanced'
|
| 526 |
+
}
|
| 527 |
+
|
| 528 |
+
if 'databar' in obj.type.lower():
|
| 529 |
+
barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))
|
| 530 |
+
|
| 531 |
+
results.append(barcode_info)
|
| 532 |
+
|
| 533 |
+
except Exception as e:
|
| 534 |
+
print(f"Error in basic pyzbar detection: {str(e)}")
|
| 535 |
+
|
| 536 |
+
return results
|
| 537 |
+
|
| 538 |
+
def detect_with_dynamsoft(self, image):
|
| 539 |
+
"""Detect barcodes using Dynamsoft Barcode Reader"""
|
| 540 |
+
results = []
|
| 541 |
+
|
| 542 |
+
try:
|
| 543 |
+
if not DBR_AVAILABLE:
|
| 544 |
+
return results
|
| 545 |
+
|
| 546 |
+
# Initialize Dynamsoft Barcode Reader
|
| 547 |
+
reader = BarcodeReader()
|
| 548 |
+
|
| 549 |
+
# Convert OpenCV image to bytes for Dynamsoft
|
| 550 |
+
success, buffer = cv2.imencode('.png', image)
|
| 551 |
+
if not success:
|
| 552 |
+
print("Failed to encode image for Dynamsoft")
|
| 553 |
+
return results
|
| 554 |
+
|
| 555 |
+
image_bytes = buffer.tobytes()
|
| 556 |
+
|
| 557 |
+
# Decode barcodes
|
| 558 |
+
text_results = reader.decode_file_stream(image_bytes)
|
| 559 |
+
|
| 560 |
+
for result in text_results:
|
| 561 |
+
barcode_info = {
|
| 562 |
+
'type': result.barcode_format_string,
|
| 563 |
+
'data': result.barcode_text,
|
| 564 |
+
'rect': type('Rect', (), {
|
| 565 |
+
'left': result.localization_result.x1,
|
| 566 |
+
'top': result.localization_result.y1,
|
| 567 |
+
'width': result.localization_result.x2 - result.localization_result.x1,
|
| 568 |
+
'height': result.localization_result.y2 - result.localization_result.y1
|
| 569 |
+
})(),
|
| 570 |
+
'polygon': [
|
| 571 |
+
(result.localization_result.x1, result.localization_result.y1),
|
| 572 |
+
(result.localization_result.x2, result.localization_result.y1),
|
| 573 |
+
(result.localization_result.x2, result.localization_result.y2),
|
| 574 |
+
(result.localization_result.x1, result.localization_result.y2)
|
| 575 |
+
],
|
| 576 |
+
'quality': result.confidence,
|
| 577 |
+
'orientation': self.detect_barcode_orientation(result),
|
| 578 |
+
'method': 'dynamsoft'
|
| 579 |
+
}
|
| 580 |
+
|
| 581 |
+
# Enhanced DataBar Expanded detection
|
| 582 |
+
if 'databar' in result.barcode_format_string.lower() or 'expanded' in result.barcode_format_string.lower():
|
| 583 |
+
barcode_info['expanded_data'] = self.parse_databar_expanded(result.barcode_text)
|
| 584 |
+
|
| 585 |
+
results.append(barcode_info)
|
| 586 |
+
|
| 587 |
+
print(f"Dynamsoft detected {len(results)} barcodes")
|
| 588 |
+
|
| 589 |
+
except Exception as e:
|
| 590 |
+
print(f"Error in Dynamsoft detection: {str(e)}")
|
| 591 |
+
|
| 592 |
+
return results
|
| 593 |
+
|
| 594 |
+
def detect_with_enhanced_preprocessing(self, image):
|
| 595 |
+
"""Enhanced preprocessing with limited methods"""
|
| 596 |
+
results = []
|
| 597 |
+
|
| 598 |
+
try:
|
| 599 |
+
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
| 600 |
+
|
| 601 |
+
# Limited preprocessing methods
|
| 602 |
+
processed_images = [
|
| 603 |
+
gray, # Original
|
| 604 |
+
cv2.resize(gray, (gray.shape[1] * 3, gray.shape[0] * 3), interpolation=cv2.INTER_CUBIC), # 3x scale
|
| 605 |
+
cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2), # Adaptive threshold
|
| 606 |
+
]
|
| 607 |
+
|
| 608 |
+
for i, processed_image in enumerate(processed_images):
|
| 609 |
+
try:
|
| 610 |
+
decoded_objects = decode(processed_image)
|
| 611 |
+
|
| 612 |
+
for obj in decoded_objects:
|
| 613 |
+
barcode_info = {
|
| 614 |
+
'type': obj.type,
|
| 615 |
+
'data': obj.data.decode('utf-8', errors='ignore'),
|
| 616 |
+
'rect': obj.rect,
|
| 617 |
+
'polygon': obj.polygon,
|
| 618 |
+
'quality': getattr(obj, 'quality', 0),
|
| 619 |
+
'orientation': self.detect_barcode_orientation(obj),
|
| 620 |
+
'method': f'enhanced_preprocessing_{i}'
|
| 621 |
+
}
|
| 622 |
+
|
| 623 |
+
if 'databar' in obj.type.lower():
|
| 624 |
+
barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))
|
| 625 |
+
|
| 626 |
+
results.append(barcode_info)
|
| 627 |
+
|
| 628 |
+
except Exception as e:
|
| 629 |
+
print(f"Error in enhanced preprocessing method {i}: {str(e)}")
|
| 630 |
+
continue
|
| 631 |
+
|
| 632 |
+
except Exception as e:
|
| 633 |
+
print(f"Error in enhanced preprocessing: {str(e)}")
|
| 634 |
+
|
| 635 |
+
return results
|
| 636 |
+
|
| 637 |
+
    def detect_small_barcodes_simple(self, image):
        """Simplified small barcode detection"""
        results = []

        try:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            # Only try 3x and 4x scaling
            scale_factors = [3.0, 4.0]

            for scale in scale_factors:
                try:
                    height, width = gray.shape
                    new_height, new_width = int(height * scale), int(width * scale)
                    scaled = cv2.resize(gray, (new_width, new_height), interpolation=cv2.INTER_CUBIC)

                    decoded_objects = decode(scaled)

                    for obj in decoded_objects:
                        # Scale back coordinates
                        scale_factor = width / new_width
                        scaled_rect = type('Rect', (), {
                            'left': int(obj.rect.left * scale_factor),
                            'top': int(obj.rect.top * scale_factor),
                            'width': int(obj.rect.width * scale_factor),
                            'height': int(obj.rect.height * scale_factor)
                        })()

                        barcode_info = {
                            'type': obj.type,
                            'data': obj.data.decode('utf-8', errors='ignore'),
                            'rect': scaled_rect,
                            'polygon': obj.polygon,
                            'quality': getattr(obj, 'quality', 0),
                            'orientation': self.detect_barcode_orientation(obj),
                            'method': f'small_barcode_{scale}x',
                            'size_category': 'small'
                        }

                        if 'databar' in obj.type.lower():
                            barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))

                        results.append(barcode_info)

                except Exception as e:
                    print(f"Error in small barcode detection at {scale}x: {str(e)}")
                    continue

        except Exception as e:
            print(f"Error in small barcode detection: {str(e)}")

        return results

    def preprocess_image_for_ocr(self, image):
        """Preprocess image for better OCR results"""
        try:
            # Convert to grayscale
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            # Apply different preprocessing techniques

            # 1. Resize image to improve small text recognition
            height, width = gray.shape
            scale_factor = 3.0  # Scale up for better small font recognition
            new_height, new_width = int(height * scale_factor), int(width * scale_factor)
            resized = cv2.resize(gray, (new_width, new_height), interpolation=cv2.INTER_CUBIC)

            # 2. Apply Gaussian blur to reduce noise
            blurred = cv2.GaussianBlur(resized, (1, 1), 0)

            # 3. Apply adaptive thresholding for better text separation
            thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

            # 4. Apply morphological operations to clean up text
            kernel = np.ones((1, 1), np.uint8)
            cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)

            # 5. Apply contrast enhancement
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(cleaned)

            return enhanced

        except Exception as e:
            print(f"Error preprocessing image: {str(e)}")
            return image  # Return original if preprocessing fails

    def preprocess_for_barcode_detection(self, image):
        """Preprocess image with multiple techniques for better barcode detection"""
        processed_images = [image]  # Start with original

        try:
            # Convert to grayscale
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            processed_images.append(gray)

            # Apply different preprocessing techniques

            # 1. Contrast enhancement
            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(gray)
            processed_images.append(enhanced)

            # 2. Gaussian blur for noise reduction
            blurred = cv2.GaussianBlur(gray, (3, 3), 0)
            processed_images.append(blurred)

            # 3. Adaptive thresholding
            thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
            processed_images.append(thresh)

            # 4. Edge enhancement for better barcode detection
            kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
            sharpened = cv2.filter2D(gray, -1, kernel)
            processed_images.append(sharpened)

            # 5. Scale up for small barcodes
            height, width = gray.shape
            scale_factor = 3.0
            new_height, new_width = int(height * scale_factor), int(width * scale_factor)
            scaled = cv2.resize(gray, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
            processed_images.append(scaled)

        except Exception as e:
            print(f"Error in barcode preprocessing: {str(e)}")

        return processed_images

    def preprocess_for_databar(self, gray_image):
        """Specialized preprocessing for DataBar Expanded Stacked barcodes"""
        processed_images = []

        try:
            # Original grayscale
            processed_images.append(gray_image)

            # 1. High contrast enhancement for DataBar
            clahe = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(gray_image)
            processed_images.append(enhanced)

            # 2. Bilateral filter to preserve edges while reducing noise
            bilateral = cv2.bilateralFilter(gray_image, 9, 75, 75)
            processed_images.append(bilateral)

            # 3. Adaptive thresholding with different parameters
            thresh1 = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, 2)
            processed_images.append(thresh1)

            thresh2 = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
            processed_images.append(thresh2)

            # 4. Scale up for better DataBar detection
            height, width = gray_image.shape
            scale_factors = [2.0, 3.0, 4.0]

            for scale in scale_factors:
                new_height, new_width = int(height * scale), int(width * scale)
                scaled = cv2.resize(gray_image, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
                processed_images.append(scaled)

            # 5. Edge enhancement specifically for DataBar
            kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
            sharpened = cv2.filter2D(gray_image, -1, kernel)
            processed_images.append(sharpened)

            # 6. Morphological operations for DataBar
            kernel = np.ones((2, 2), np.uint8)
            morphed = cv2.morphologyEx(gray_image, cv2.MORPH_CLOSE, kernel)
            processed_images.append(morphed)

        except Exception as e:
            print(f"Error in DataBar preprocessing: {str(e)}")

        return processed_images

    def detect_with_transformations(self, image):
        """Detect barcodes using multiple image transformations"""
        results = []

        try:
            # Try different rotations
            angles = [0, 90, 180, 270]

            for angle in angles:
                if angle == 0:
                    rotated_image = image
                else:
                    height, width = image.shape[:2]
                    center = (width // 2, height // 2)
                    rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
                    rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height))

                # Try to detect barcodes in rotated image
                try:
                    decoded_objects = decode(rotated_image)

                    for obj in decoded_objects:
                        barcode_info = {
                            'type': obj.type,
                            'data': obj.data.decode('utf-8', errors='ignore'),
                            'rect': obj.rect,
                            'polygon': obj.polygon,
                            'quality': getattr(obj, 'quality', 0),
                            'orientation': f"{angle}°",
                            'method': f'transform_{angle}deg'
                        }

                        # Enhanced DataBar Expanded detection
                        if 'databar' in obj.type.lower() or 'expanded' in obj.type.lower():
                            barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))

                        # Check for multi-stack barcodes
                        if self.is_multi_stack_barcode(obj, rotated_image):
                            barcode_info['stack_type'] = self.detect_stack_type(obj, rotated_image)

                        results.append(barcode_info)

                except Exception as e:
                    print(f"Error in transformation detection at {angle}°: {str(e)}")
                    continue

        except Exception as e:
            print(f"Error in transformation detection: {str(e)}")

        return results

    def detect_small_barcodes(self, image):
        """Specialized detection for small barcodes and QR codes"""
        results = []

        try:
            # Convert to grayscale
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            # Apply specialized preprocessing for small barcodes
            processed_images = self.preprocess_for_small_barcodes(gray)

            for processed_image in processed_images:
                try:
                    decoded_objects = decode(processed_image)

                    for obj in decoded_objects:
                        # Check if this is a small barcode (less than 50x50 pixels)
                        if obj.rect.width < 50 or obj.rect.height < 50:
                            barcode_info = {
                                'type': obj.type,
                                'data': obj.data.decode('utf-8', errors='ignore'),
                                'rect': obj.rect,
                                'polygon': obj.polygon,
                                'quality': getattr(obj, 'quality', 0),
                                'orientation': self.detect_barcode_orientation(obj),
                                'method': 'small_barcode_detection',
                                'size_category': 'small'
                            }

                            # Enhanced DataBar Expanded detection
                            if 'databar' in obj.type.lower() or 'expanded' in obj.type.lower():
                                barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))

                            # Check for multi-stack barcodes
                            if self.is_multi_stack_barcode(obj, image):
                                barcode_info['stack_type'] = self.detect_stack_type(obj, image)

                            results.append(barcode_info)

                except Exception as e:
                    print(f"Error in small barcode detection: {str(e)}")
                    continue

        except Exception as e:
            print(f"Error in small barcode preprocessing: {str(e)}")

        return results

    def preprocess_for_small_barcodes(self, gray_image):
        """Specialized preprocessing for small barcodes and QR codes"""
        processed_images = []

        try:
            # Original grayscale
            processed_images.append(gray_image)

            # 1. Multiple high-resolution scaling for small barcodes
            height, width = gray_image.shape
            scale_factors = [4.0, 5.0, 6.0, 8.0]  # Higher scaling for small barcodes

            for scale in scale_factors:
                new_height, new_width = int(height * scale), int(width * scale)
                scaled = cv2.resize(gray_image, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
                processed_images.append(scaled)

            # 2. Aggressive contrast enhancement
            clahe = cv2.createCLAHE(clipLimit=5.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(gray_image)
            processed_images.append(enhanced)

            # 3. Unsharp masking for edge enhancement
            gaussian = cv2.GaussianBlur(gray_image, (0, 0), 2.0)
            unsharp = cv2.addWeighted(gray_image, 1.5, gaussian, -0.5, 0)
            processed_images.append(unsharp)

            # 4. Multiple thresholding methods
            # Otsu's thresholding
            _, otsu = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            processed_images.append(otsu)

            # Adaptive thresholding with different parameters
            adaptive1 = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 9, 2)
            processed_images.append(adaptive1)

            adaptive2 = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 7, 2)
            processed_images.append(adaptive2)

            # 5. Noise reduction with different methods
            # Bilateral filter
            bilateral = cv2.bilateralFilter(gray_image, 9, 75, 75)
            processed_images.append(bilateral)

            # Median filter
            median = cv2.medianBlur(gray_image, 3)
            processed_images.append(median)

            # 6. Edge detection and enhancement
            # Sobel edge detection
            sobel_x = cv2.Sobel(gray_image, cv2.CV_64F, 1, 0, ksize=3)
            sobel_y = cv2.Sobel(gray_image, cv2.CV_64F, 0, 1, ksize=3)
            sobel = np.sqrt(sobel_x**2 + sobel_y**2)
            sobel = np.uint8(sobel * 255 / sobel.max())
            processed_images.append(sobel)

            # 7. Morphological operations for small barcode cleanup
            kernel = np.ones((2, 2), np.uint8)
            morphed_close = cv2.morphologyEx(gray_image, cv2.MORPH_CLOSE, kernel)
            processed_images.append(morphed_close)

            kernel_open = np.ones((1, 1), np.uint8)
            morphed_open = cv2.morphologyEx(gray_image, cv2.MORPH_OPEN, kernel_open)
            processed_images.append(morphed_open)

        except Exception as e:
            print(f"Error in small barcode preprocessing: {str(e)}")

        return processed_images

    def detect_with_high_resolution(self, image):
        """Detect barcodes using high-resolution processing"""
        results = []

        try:
            # Convert to grayscale
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            # Process at multiple high resolutions
            height, width = gray.shape
            resolutions = [
                (int(width * 3), int(height * 3)),  # 3x resolution
                (int(width * 4), int(height * 4)),  # 4x resolution
                (int(width * 6), int(height * 6))   # 6x resolution
            ]

            for new_width, new_height in resolutions:
                try:
                    # Resize with high-quality interpolation
                    resized = cv2.resize(gray, (new_width, new_height), interpolation=cv2.INTER_CUBIC)

                    # Apply high-resolution preprocessing
                    processed = self.preprocess_high_resolution(resized)

                    # Try to detect barcodes
                    decoded_objects = decode(processed)

                    for obj in decoded_objects:
                        # Scale back the coordinates to original image size
                        scale_factor = width / new_width
                        scaled_rect = type('Rect', (), {
                            'left': int(obj.rect.left * scale_factor),
                            'top': int(obj.rect.top * scale_factor),
                            'width': int(obj.rect.width * scale_factor),
                            'height': int(obj.rect.height * scale_factor)
                        })()

                        barcode_info = {
                            'type': obj.type,
                            'data': obj.data.decode('utf-8', errors='ignore'),
                            'rect': scaled_rect,
                            'polygon': obj.polygon,
                            'quality': getattr(obj, 'quality', 0),
                            'orientation': self.detect_barcode_orientation(obj),
                            'method': f'high_res_{new_width}x{new_height}',
                            'resolution': f'{new_width}x{new_height}'
                        }

                        # Enhanced DataBar Expanded detection
                        if 'databar' in obj.type.lower() or 'expanded' in obj.type.lower():
                            barcode_info['expanded_data'] = self.parse_databar_expanded(obj.data.decode('utf-8', errors='ignore'))

                        # Check for multi-stack barcodes
                        if self.is_multi_stack_barcode(obj, image):
                            barcode_info['stack_type'] = self.detect_stack_type(obj, image)

                        results.append(barcode_info)

                except Exception as e:
                    print(f"Error in high-resolution detection at {new_width}x{new_height}: {str(e)}")
                    continue

        except Exception as e:
            print(f"Error in high-resolution detection: {str(e)}")

        return results

    def preprocess_high_resolution(self, image):
        """Preprocessing optimized for high-resolution images"""
        try:
            # 1. High-quality noise reduction
            denoised = cv2.fastNlMeansDenoising(image)

            # 2. Advanced contrast enhancement
            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(denoised)

            # 3. Edge-preserving smoothing
            bilateral = cv2.bilateralFilter(enhanced, 9, 75, 75)

            # 4. Sharpening
            kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]])
            sharpened = cv2.filter2D(bilateral, -1, kernel)

            # 5. Adaptive thresholding for high-res
            thresh = cv2.adaptiveThreshold(sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

            return thresh

        except Exception as e:
            print(f"Error in high-resolution preprocessing: {str(e)}")
            return image

    def detect_barcode_orientation(self, barcode_obj):
        """Detect the orientation of the barcode"""
        try:
            if hasattr(barcode_obj, 'polygon') and len(barcode_obj.polygon) >= 4:
                # Calculate orientation based on polygon points
                points = np.array(barcode_obj.polygon)
                # Calculate the angle of the longest edge
                edges = []
                for i in range(4):
                    p1 = points[i]
                    p2 = points[(i + 1) % 4]
                    edge_length = np.linalg.norm(p2 - p1)
                    angle = np.arctan2(p2[1] - p1[1], p2[0] - p1[0]) * 180 / np.pi
                    edges.append((edge_length, angle))

                # Find the longest edge (likely the main barcode direction)
                longest_edge = max(edges, key=lambda x: x[0])
                return f"{longest_edge[1]:.1f}°"

            return "Unknown"
        except:
            return "Unknown"

    def parse_databar_expanded(self, data):
        """Parse DataBar Expanded barcode data"""
        try:
            # DataBar Expanded can contain multiple data fields
            # Format: [01]12345678901234[3101]123[3102]456
            parsed_data = {}

            # Extract GS1 Application Identifiers
            ai_pattern = r'\[(\d{2,4})\]([^\[]+)'
            matches = re.findall(ai_pattern, data)

            for ai, value in matches:
                parsed_data[f"AI {ai}"] = value

            # If no AI pattern found, return original data
            if not parsed_data:
                parsed_data["Raw Data"] = data

            return parsed_data

        except Exception as e:
            return {"Raw Data": data, "Parse Error": str(e)}

    def is_multi_stack_barcode(self, barcode_obj, image):
        """Detect if this is a multi-stack barcode"""
        try:
            if hasattr(barcode_obj, 'rect'):
                x, y, w, h = barcode_obj.rect

                # Check if the barcode is unusually tall (indicating stacked format)
                aspect_ratio = h / w if w > 0 else 0

                # DataBar Expanded and other stacked barcodes typically have aspect ratios > 0.3
                return aspect_ratio > 0.3

        except:
            pass

        return False

    def detect_stack_type(self, barcode_obj, image):
        """Detect the type of multi-stack barcode"""
        try:
            if hasattr(barcode_obj, 'rect'):
                x, y, w, h = barcode_obj.rect
                aspect_ratio = h / w if w > 0 else 0

                # Classify based on aspect ratio and barcode type
                if 'databar' in barcode_obj.type.lower():
                    if aspect_ratio > 0.5:
                        return "Quad Stack"
                    elif aspect_ratio > 0.35:
                        return "Triple Stack"
                    elif aspect_ratio > 0.25:
                        return "Double Stack"
                    else:
                        return "Single Stack"
                else:
                    # For other barcode types
                    if aspect_ratio > 0.4:
                        return "Multi-Stack"
                    else:
                        return "Single Stack"

        except:
            pass

        return "Unknown"

    def remove_duplicate_barcodes(self, barcodes):
        """Remove duplicate barcodes based on position and data"""
        unique_barcodes = []
        seen_positions = set()
        seen_data = set()

        for barcode in barcodes:
            # Create position signature
            pos_signature = f"{barcode['rect'].left},{barcode['rect'].top},{barcode['rect'].width},{barcode['rect'].height}"
            data_signature = barcode['data']

            # Check if we've seen this position or data before
            if pos_signature not in seen_positions and data_signature not in seen_data:
                unique_barcodes.append(barcode)
                seen_positions.add(pos_signature)
                seen_data.add(data_signature)

        return unique_barcodes

    def enhance_barcode_data(self, barcodes):
        """Enhance barcode data with additional analysis"""
        enhanced_barcodes = []

        for barcode in barcodes:
            # Add confidence score based on method and quality
            confidence = self.calculate_confidence(barcode)
            barcode['confidence'] = confidence

            # Add GS1 validation for DataBar
            if 'databar' in barcode['type'].lower():
                barcode['gs1_validated'] = self.validate_gs1_format(barcode['data'])

            enhanced_barcodes.append(barcode)

        return enhanced_barcodes

    def calculate_confidence(self, barcode):
        """Calculate confidence score for barcode detection"""
        confidence = 50  # Base confidence

        # Method confidence
        method_scores = {
            'pyzbar_basic': 70,
            'pyzbar_enhanced': 70,
            'dynamsoft': 85,  # Dynamsoft typically has higher accuracy
            'enhanced_preprocessing_0': 65,
            'enhanced_preprocessing_1': 60,
            'enhanced_preprocessing_2': 55,
            'transform_0deg': 60,
            'transform_90deg': 50,
            'transform_180deg': 50,
            'transform_270deg': 50,
            'small_barcode_detection': 75,
            'high_res_2x': 70,
            'high_res_3x': 65,
            'high_res_4x': 60
        }

        if barcode.get('method') in method_scores:
            confidence += method_scores[barcode['method']]

        # Quality score
        if barcode.get('quality', 0) > 0:
            confidence += min(barcode['quality'], 20)

        # DataBar specific confidence
        if 'databar' in barcode['type'].lower():
            confidence += 10

        return min(confidence, 100)

    def validate_gs1_format(self, data):
        """Validate GS1 format for DataBar data"""
        try:
            # Check for GS1 Application Identifiers
            ai_pattern = r'\[(\d{2,4})\]'
            matches = re.findall(ai_pattern, data)

            if matches:
                return True

            # Check for parentheses format
            ai_pattern_parens = r'\((\d{2,4})\)'
            matches_parens = re.findall(ai_pattern_parens, data)

            return len(matches_parens) > 0

        except:
            return False

    def check_spelling(self, text):
        """
        Robust EN/FR spell check:
        - Unicode-aware tokens (keeps accents)
        - Normalizes curly quotes/ligatures
        - Heuristic per-token language (accented => FR; else EN)
        - Flags if unknown in its likely language (not both)
        """
        try:
            # Normalize ligatures; map curly quotes to their straight equivalents
            text = unicodedata.normalize("NFKC", text)
            text = text.replace("\u2019", "'").replace("\u201c", '"').replace("\u201d", '"')

            # Unicode letters with internal ' or - allowed
            tokens = _re.findall(TOKEN_PATTERN, text, flags=_re.UNICODE if _USE_REGEX else 0)

            issues = []
            for raw in tokens:
                t = raw.lower()

                # Skip very short tokens, short ALL-CAPS acronyms, and whitelisted terms
                if len(t) < 3:
                    continue
                if raw.isupper() and len(raw) <= 3:
                    continue
                if t in DOMAIN_WHITELIST:
                    continue

                miss_en = t in self.english_spellchecker.unknown([t])
                miss_fr = t in self.french_spellchecker.unknown([t])

                use_fr = _likely_french(raw)

                # Prefer the likely language, but fall back to "unknown in both"
                if (use_fr and miss_fr) or ((not use_fr) and miss_en) or (miss_en and miss_fr):
                    issues.append({
                        "word": raw,
                        "lang": "fr" if use_fr else "en",
                        "suggestions_en": list(self.english_spellchecker.candidates(t))[:3],
                        "suggestions_fr": list(self.french_spellchecker.candidates(t))[:3],
                    })

            return issues
        except Exception as e:
            print(f"Error checking spelling: {e}")
            return []

    def compare_colors(self, image1, image2):
        """Compare colors between two images and return differences using RGB color space"""
        try:
            print("Starting RGB color comparison...")

            # Convert images to arrays
            img1 = np.array(image1)
            img2 = np.array(image2)

            print(f"Image 1 shape: {img1.shape}")
            print(f"Image 2 shape: {img2.shape}")

            # Resize images to same dimensions
            height = min(img1.shape[0], img2.shape[0])
            width = min(img1.shape[1], img2.shape[1])

            img1_resized = cv2.resize(img1, (width, height))
            img2_resized = cv2.resize(img2, (width, height))

            print(f"Resized to: {width}x{height}")

            # Keep images in RGB format (no conversion to BGR)
            img1_rgb = img1_resized
            img2_rgb = img2_resized

            color_differences = []

            # Method 1: Enhanced RGB channel comparison
            print("Method 1: Enhanced RGB channel comparison")

            # Calculate absolute difference for each RGB channel
            diff_r = cv2.absdiff(img1_rgb[:, :, 0], img2_rgb[:, :, 0])  # Red channel
            diff_g = cv2.absdiff(img1_rgb[:, :, 1], img2_rgb[:, :, 1])  # Green channel
            diff_b = cv2.absdiff(img1_rgb[:, :, 2], img2_rgb[:, :, 2])  # Blue channel

            # Enhanced RGB combination with better weighting
            diff_combined = cv2.addWeighted(diff_r, 0.4, diff_g, 0.4, 0)  # Red and Green weighted higher
            diff_combined = cv2.addWeighted(diff_combined, 1.0, diff_b, 0.2, 0)  # Blue weighted lower

            # Apply Gaussian blur to reduce noise and improve accuracy
            diff_combined = cv2.GaussianBlur(diff_combined, (3, 3), 0)

            # Apply balanced thresholds to catch color variations while avoiding multiple boxes
            rgb_thresholds = [15, 22, 30, 40]  # Balanced thresholds

            for threshold in rgb_thresholds:
                _, thresh = cv2.threshold(diff_combined, threshold, 255, cv2.THRESH_BINARY)

                # Apply minimal morphological operations
                kernel = np.ones((1, 1), np.uint8)  # Minimal kernel to preserve detail
                thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
                thresh = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)

                # Find contours
                contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

                print(f"RGB Threshold {threshold}: Found {len(contours)} contours")

                for contour in contours:
                    area = cv2.contourArea(contour)
                    if area > 15:  # Balanced area threshold to catch variations while avoiding small boxes
                        x, y, w, h = cv2.boundingRect(contour)

                        # Get the actual RGB colors at this location
                        color1 = img1_rgb[y:y+h, x:x+w].mean(axis=(0, 1))
                        color2 = img2_rgb[y:y+h, x:x+w].mean(axis=(0, 1))

                        # Calculate RGB color difference magnitude
                        color_diff = np.linalg.norm(color1 - color2)

                        # Flag moderate color differences
                        if color_diff > 18:  # Balanced threshold
                            # Check if this area is already covered
                            already_covered = False
                            for existing_diff in color_differences:
                                if (abs(existing_diff['x'] - x) < 21 and
                                        abs(existing_diff['y'] - y) < 21 and
                                        abs(existing_diff['width'] - w) < 21 and
                                        abs(existing_diff['height'] - h) < 21):
                                    already_covered = True
                                    break

                            if not already_covered:
                                color_differences.append({
                                    'x': x,
                                    'y': y,
                                    'width': w,
                                    'height': h,
                                    'area': area,
                                    'color1': color1.tolist(),
                                    'color2': color2.tolist(),
                                    'threshold': f"RGB_{threshold}",
                                    'color_diff': color_diff,
                                    'diff_r': float(abs(color1[0] - color2[0])),
                                    'diff_g': float(abs(color1[1] - color2[1])),
                                    'diff_b': float(abs(color1[2] - color2[2]))
                                })

            # Method 2: Enhanced HSV color space comparison
            print("Method 2: Enhanced HSV color space comparison")

            # Convert to HSV for better color difference detection
            img1_hsv = cv2.cvtColor(img1_rgb, cv2.COLOR_RGB2HSV)
            img2_hsv = cv2.cvtColor(img2_rgb, cv2.COLOR_RGB2HSV)

            # Enhanced HSV comparison with better channel weighting
            hue_diff = cv2.absdiff(img1_hsv[:, :, 0], img2_hsv[:, :, 0])  # Hue channel
            sat_diff = cv2.absdiff(img1_hsv[:, :, 1], img2_hsv[:, :, 1])  # Saturation channel
            val_diff = cv2.absdiff(img1_hsv[:, :, 2], img2_hsv[:, :, 2])  # Value channel

            # Enhanced HSV combination with better weighting
            hsv_combined = cv2.addWeighted(hue_diff, 0.5, sat_diff, 0.3, 0)  # Hue and Saturation
            hsv_combined = cv2.addWeighted(hsv_combined, 1.0, val_diff, 0.2, 0)  # Add Value channel

            # Apply Gaussian blur to reduce noise and improve accuracy
            hsv_combined = cv2.GaussianBlur(hsv_combined, (3, 3), 0)

            # Apply balanced HSV thresholds to catch color variations while avoiding multiple boxes
            hsv_thresholds = [18, 25, 35, 45]  # Balanced HSV thresholds

            for threshold in hsv_thresholds:
                _, hsv_thresh = cv2.threshold(hsv_combined, threshold, 255, cv2.THRESH_BINARY)

                # Apply minimal morphological operations
                kernel = np.ones((1, 1), np.uint8)
                hsv_thresh = cv2.morphologyEx(hsv_thresh, cv2.MORPH_CLOSE, kernel)
                hsv_thresh = cv2.morphologyEx(hsv_thresh, cv2.MORPH_OPEN, kernel)

                # Find contours
                hsv_contours, _ = cv2.findContours(hsv_thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

                print(f"HSV Threshold {threshold}: Found {len(hsv_contours)} contours")

                for contour in hsv_contours:
                    area = cv2.contourArea(contour)
                    if area > 15:  # Balanced area threshold
                        x, y, w, h = cv2.boundingRect(contour)

                        # Get the actual colors at this location
                        color1 = img1_rgb[y:y+h, x:x+w].mean(axis=(0, 1))
                        color2 = img2_rgb[y:y+h, x:x+w].mean(axis=(0, 1))

                        # Calculate color difference magnitude
                        color_diff = np.linalg.norm(color1 - color2)

                        # Flag moderate color differences
                        if color_diff > 22:  # Balanced threshold
                            # Check if this area is already covered
                            already_covered = False
                            for existing_diff in color_differences:
                                if (abs(existing_diff['x'] - x) < 21 and
                                        abs(existing_diff['y'] - y) < 21 and
                                        abs(existing_diff['width'] - w) < 21 and
                                        abs(existing_diff['height'] - h) < 21):
                                    already_covered = True
                                    break

                            if not already_covered:
                                color_differences.append({
                                    'x': x,
                                    'y': y,
                                    'width': w,
                                    'height': h,
                                    'area': area,
                                    'color1': color1.tolist(),
                                    'color2': color2.tolist(),
                                    'threshold': f"HSV_{threshold}",
                                    'color_diff': color_diff,
                                    'diff_r': float(abs(color1[0] - color2[0])),
                                    'diff_g': float(abs(color1[1] - color2[1])),
                                    'diff_b': float(abs(color1[2] - color2[2]))
                                })

            # Method 3: Pixel-by-pixel RGB comparison
            print("Method 3: Enhanced pixel-by-pixel RGB comparison")

            # Sample every 12th pixel for less sensitivity
            for y in range(0, height, 12):
                for x in range(0, width, 12):
                    color1 = img1_rgb[y, x]
                    color2 = img2_rgb[y, x]

                    # Calculate absolute difference for each RGB channel
                    diff_r = abs(int(color1[0]) - int(color2[0]))  # Red channel
                    diff_g = abs(int(color1[1]) - int(color2[1]))  # Green channel
                    diff_b = abs(int(color1[2]) - int(color2[2]))  # Blue channel

                    # Flag if RGB channels differ by moderate amounts
                    if diff_r > 10 or diff_g > 10 or diff_b > 10:
                        # Check if this area is already covered
                        already_covered = False
                        for existing_diff in color_differences:
                            if (abs(existing_diff['x'] - x) < 21 and
                                    abs(existing_diff['y'] - y) < 21):
                                already_covered = True
                                break

                        if not already_covered:
                            color_differences.append({
                                'x': x,
                                'y': y,
                                'width': 5,  # Small box around the pixel
                                'height': 5,
                                'area': 25,
                                'color1': color1.tolist(),
                                'color2': color2.tolist(),
                                'threshold': 'pixel_RGB',
                                'color_diff': diff_r + diff_g + diff_b,
                                'diff_r': diff_r,
                                'diff_g': diff_g,
                                'diff_b': diff_b
                            })

            print(f"RGB color comparison completed. Found {len(color_differences)} total differences.")

            # Method 4: LAB color space comparison for perceptual accuracy
            print("Method 4: LAB color space comparison")

            # Convert to LAB color space for perceptual color differences
            img1_lab = cv2.cvtColor(img1_rgb, cv2.COLOR_RGB2LAB)
            img2_lab = cv2.cvtColor(img2_rgb, cv2.COLOR_RGB2LAB)

            # Calculate LAB differences (perceptually uniform)
            lab_diff_l = cv2.absdiff(img1_lab[:, :, 0], img2_lab[:, :, 0])  # L channel (lightness)
            lab_diff_a = cv2.absdiff(img1_lab[:, :, 1], img2_lab[:, :, 1])  # a channel (green-red)
            lab_diff_b = cv2.absdiff(img1_lab[:, :, 2], img2_lab[:, :, 2])  # b channel (blue-yellow)

            # Combine LAB differences with perceptual weighting
            lab_combined = cv2.addWeighted(lab_diff_l, 0.3, lab_diff_a, 0.35, 0)  # L and a channels
            lab_combined = cv2.addWeighted(lab_combined, 1.0, lab_diff_b, 0.35, 0)  # Add b channel

            # Apply Gaussian blur for noise reduction
            lab_combined = cv2.GaussianBlur(lab_combined, (3, 3), 0)

            # Apply balanced LAB thresholds to catch color variations while avoiding multiple boxes
            lab_thresholds = [20, 28, 38, 50]  # Balanced LAB thresholds

            for threshold in lab_thresholds:
                _, lab_thresh = cv2.threshold(lab_combined, threshold, 255, cv2.THRESH_BINARY)

                # Apply morphological operations
                kernel = np.ones((1, 1), np.uint8)
                lab_thresh = cv2.morphologyEx(lab_thresh, cv2.MORPH_CLOSE, kernel)
                lab_thresh = cv2.morphologyEx(lab_thresh, cv2.MORPH_OPEN, kernel)

                # Find contours
                lab_contours, _ = cv2.findContours(lab_thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

                print(f"LAB Threshold {threshold}: Found {len(lab_contours)} contours")

                for contour in lab_contours:
                    area = cv2.contourArea(contour)
                    if area > 15:  # Balanced area threshold
                        x, y, w, h = cv2.boundingRect(contour)

                        # Get the actual colors at this location
                        color1 = img1_rgb[y:y+h, x:x+w].mean(axis=(0, 1))
                        color2 = img2_rgb[y:y+h, x:x+w].mean(axis=(0, 1))

                        # Calculate color difference magnitude
                        color_diff = np.linalg.norm(color1 - color2)

                        # Flag moderate color differences
                        if color_diff > 22:  # Balanced threshold
                            # Check if this area is already covered
                            already_covered = False
                            for existing_diff in color_differences:
                                if (abs(existing_diff['x'] - x) < 21 and
                                        abs(existing_diff['y'] - y) < 21 and
                                        abs(existing_diff['width'] - w) < 21 and
                                        abs(existing_diff['height'] - h) < 21):
                                    already_covered = True
                                    break

                            if not already_covered:
                                color_differences.append({
                                    'x': x,
                                    'y': y,
                                    'width': w,
                                    'height': h,
                                    'area': area,
                                    'color1': color1.tolist(),
                                    'color2': color2.tolist(),
                                    'threshold': f"LAB_{threshold}",
                                    'color_diff': color_diff,
                                    'diff_r': float(abs(color1[0] - color2[0])),
                                    'diff_g': float(abs(color1[1] - color2[1])),
                                    'diff_b': float(abs(color1[2] - color2[2]))
                                })

            print(f"Enhanced color comparison completed. Found {len(color_differences)} total differences.")

            # Group nearby differences into one perimeter box per issue area
            if color_differences:
                grouped_differences = self.group_nearby_differences(color_differences)
                print(f"Grouped into {len(grouped_differences)} perimeter boxes")
                return grouped_differences

            return color_differences

        except Exception as e:
            print(f"Error comparing colors: {str(e)}")
            return []

    def group_nearby_differences(self, differences):
        """Group nearby differences into larger bounding boxes around affected areas"""
        if not differences:
            return []

        # Sort differences by position for easier grouping
        sorted_diffs = sorted(differences, key=lambda x: (x['y'], x['x']))

        grouped_areas = []
        current_group = []

        for diff in sorted_diffs:
            if not current_group:
                current_group = [diff]
            else:
                # Check if this difference is close to the current group
                should_group = False
                for group_diff in current_group:
                    # Calculate distance between centers
                    center1_x = group_diff['x'] + group_diff['width'] // 2
                    center1_y = group_diff['y'] + group_diff['height'] // 2
                    center2_x = diff['x'] + diff['width'] // 2
                    center2_y = diff['y'] + diff['height'] // 2

                    distance = ((center1_x - center2_x) ** 2 + (center1_y - center2_y) ** 2) ** 0.5

                    # If distance is less than 200 pixels, group them for one box per main issue
                    if distance < 200:
                        should_group = True
                        break

                if should_group:
                    current_group.append(diff)
                else:
                    # Create bounding box for current group
                    if current_group:
                        bounding_box = self.create_group_bounding_box(current_group)
                        if bounding_box:  # Only add if not None
                            grouped_areas.append(bounding_box)
                    current_group = [diff]

        # Don't forget the last group
        if current_group:
            bounding_box = self.create_group_bounding_box(current_group)
            if bounding_box:  # Only add if not None
                grouped_areas.append(bounding_box)

        return grouped_areas

    # NOTE: this second definition of group_nearby_differences shadows the one above;
    # only the version below (perimeter boxes, 234-pixel grouping distance) takes effect.
    def group_nearby_differences(self, differences):
        """Group nearby differences into one perimeter box per issue area"""
        if not differences:
            return []

        # Sort differences by position for easier grouping
        sorted_diffs = sorted(differences, key=lambda x: (x['y'], x['x']))

        grouped_areas = []
        current_group = []

        for diff in sorted_diffs:
            if not current_group:
                current_group = [diff]
            else:
                # Check if this difference is close to the current group
                should_group = False
                for group_diff in current_group:
                    # Calculate distance between centers
                    center1_x = group_diff['x'] + group_diff['width'] // 2
                    center1_y = group_diff['y'] + group_diff['height'] // 2
                    center2_x = diff['x'] + diff['width'] // 2
                    center2_y = diff['y'] + diff['height'] // 2

                    distance = ((center1_x - center2_x) ** 2 + (center1_y - center2_y) ** 2) ** 0.5

                    # If distance is less than 234 pixels, group them into one consolidated problem area
                    if distance < 234:
                        should_group = True
                        break

                if should_group:
                    current_group.append(diff)
                else:
                    # Create perimeter box for current group
                    if current_group:
                        perimeter_box = self.create_perimeter_box(current_group)
                        if perimeter_box:  # Only add if not None
                            grouped_areas.append(perimeter_box)
                    current_group = [diff]

        # Don't forget the last group
        if current_group:
            perimeter_box = self.create_perimeter_box(current_group)
            if perimeter_box:  # Only add if not None
                grouped_areas.append(perimeter_box)

        return grouped_areas

    def create_perimeter_box(self, group):
        """Create a perimeter box that encompasses all differences in a group"""
        if not group:
            return None

        # Find the overall bounding box (with a 5-pixel extension on each side)
        min_x = min(diff['x'] - 5 for diff in group)
        min_y = min(diff['y'] - 5 for diff in group)
        max_x = max(diff['x'] + diff['width'] + 5 for diff in group)
        max_y = max(diff['y'] + diff['height'] + 5 for diff in group)

        # Add minimal padding around the perimeter box
        padding = 7
        min_x = max(0, min_x - padding)
        min_y = max(0, min_y - padding)
        max_x = max_x + padding
        max_y = max_y + padding

        # Calculate final dimensions
        width = max_x - min_x
        height = max_y - min_y

        # Filter out very small groups
        if width < 26 or height < 26:
            return None

        return {
            'x': min_x,
            'y': min_y,
            'width': width,
            'height': height,
            'area': width * height,
            'color1': [0, 0, 0],  # Placeholder
            'color2': [0, 0, 0],  # Placeholder
            'threshold': 'perimeter',
            'color_diff': 1.0,
            'num_original_differences': len(group)
        }

    def create_annotated_image(self, image, differences, output_path):
        """Create annotated image with red boxes around differences"""
        try:
            print(f"Creating annotated image: {output_path}")
            print(f"Number of differences to annotate: {len(differences)}")

            # Create a copy of the image
            annotated_image = image.copy()
            draw = ImageDraw.Draw(annotated_image)

            # Draw red rectangles around differences
            for i, diff in enumerate(differences):
                x, y, w, h = diff['x'], diff['y'], diff['width'], diff['height']

                # Draw thicker red rectangle
                draw.rectangle([x, y, x + w, y + h], outline='red', width=5)

                print(f"Drawing rectangle {i+1}: ({x}, {y}) to ({x+w}, {y+h})")

            # Save annotated image
            annotated_image.save(output_path)
            print(f"Annotated image saved successfully: {output_path}")

        except Exception as e:
            print(f"Error creating annotated image: {str(e)}")
            # Try to save the original image as fallback
            try:
                image.save(output_path)
                print(f"Saved original image as fallback: {output_path}")
            except Exception as e2:
                print(f"Failed to save fallback image: {str(e2)}")

    def compare_pdfs(self, pdf1_path, pdf2_path, session_id):
        """Main comparison function with improved error handling"""
        try:
            print("Starting PDF comparison...")
            start_time = time.time()

            # Validate both PDFs contain "50 Carroll"
            print("Validating PDF 1...")
            if not self.validate_pdf(pdf1_path):
                raise Exception("INVALID DOCUMENT")

            print("Validating PDF 2...")
            if not self.validate_pdf(pdf2_path):
                raise Exception("INVALID DOCUMENT")

            # Extract text and images from both PDFs
            print("Extracting text from PDF 1...")
            pdf1_data = self.extract_text_from_pdf(pdf1_path)
            if not pdf1_data:
                raise Exception("INVALID DOCUMENT")

            print("Extracting text from PDF 2...")
            pdf2_data = self.extract_text_from_pdf(pdf2_path)
            if not pdf2_data:
                raise Exception("INVALID DOCUMENT")

            # Initialize results
            results = {
                'session_id': session_id,
                'validation': {
                    'pdf1_valid': True,
                    'pdf2_valid': True,
                    'validation_text': '50 Carroll'
                },
                'text_comparison': [],
                'spelling_issues': [],
                'barcodes_qr_codes': [],
                'color_differences': [],
                'annotated_images': []
            }

            # Compare text and check spelling
            print("Processing pages...")
            for i, (page1, page2) in enumerate(zip(pdf1_data, pdf2_data)):
                print(f"Processing page {i + 1}...")
                page_results = {
                    'page': i + 1,
                    'text_differences': [],
                    'spelling_issues_pdf1': [],
                    'spelling_issues_pdf2': [],
                    'barcodes_pdf1': [],
                    'barcodes_pdf2': [],
                    'color_differences': []
                }

                # Check spelling for both PDFs
                print(f"Checking spelling for page {i + 1}...")
                page_results['spelling_issues_pdf1'] = self.check_spelling(page1['text'])
                page_results['spelling_issues_pdf2'] = self.check_spelling(page2['text'])

                # Add spelling issues to text differences for UI visibility
                if page_results['spelling_issues_pdf1'] or page_results['spelling_issues_pdf2']:
                    page_results['text_differences'].append({
                        "type": "spelling",
                        "pdf1": [i["word"] for i in page_results['spelling_issues_pdf1']],
                        "pdf2": [i["word"] for i in page_results['spelling_issues_pdf2']],
                    })

                # Create spelling-only annotated images (one box per error)
                spell_dir = f'static/results/{session_id}'
                os.makedirs(spell_dir, exist_ok=True)

                spell_img1 = page1['image'].copy()
                spell_img2 = page2['image'].copy()
                spell_img1 = self.annotate_spelling_errors_on_image(spell_img1, page_results['spelling_issues_pdf1'])
                spell_img2 = self.annotate_spelling_errors_on_image(spell_img2, page_results['spelling_issues_pdf2'])

                spell_path1 = f'{spell_dir}/page_{i+1}_pdf1_spelling.png'
                spell_path2 = f'{spell_dir}/page_{i+1}_pdf2_spelling.png'
                spell_img1.save(spell_path1)
                spell_img2.save(spell_path2)

                # Link them into the results for the UI
                page_results.setdefault('annotated_images', {})
                page_results['annotated_images'].update({
                    'pdf1_spelling': f'results/{session_id}/page_{i+1}_pdf1_spelling.png',
                    'pdf2_spelling': f'results/{session_id}/page_{i+1}_pdf2_spelling.png',
                })

                # Detect barcodes and QR codes
                print(f"Detecting barcodes for page {i + 1} PDF 1...")
                page_results['barcodes_pdf1'] = self.detect_barcodes_qr_codes(page1['image']) or []

                print(f"Detecting barcodes for page {i + 1} PDF 2...")
                page_results['barcodes_pdf2'] = self.detect_barcodes_qr_codes(page2['image']) or []

                # Compare colors
                print(f"Comparing colors for page {i + 1}...")
                color_diffs = self.compare_colors(page1['image'], page2['image'])
                page_results['color_differences'] = color_diffs

                # Create annotated images and save original images
                print(f"Creating images for page {i + 1}...")
                output_dir = f'static/results/{session_id}'
                os.makedirs(output_dir, exist_ok=True)

                # Save original images
                original_path1 = f'{output_dir}/page_{i+1}_pdf1_original.png'
                original_path2 = f'{output_dir}/page_{i+1}_pdf2_original.png'

                page1['image'].save(original_path1)
                page2['image'].save(original_path2)

                # Create annotated images if there are color differences
                # (update rather than replace the dict so the spelling overlays linked above are kept)
                if color_diffs:
                    print(f"Creating annotated images for page {i + 1}...")
                    annotated_path1 = f'{output_dir}/page_{i+1}_pdf1_annotated.png'
                    annotated_path2 = f'{output_dir}/page_{i+1}_pdf2_annotated.png'

                    self.create_annotated_image(page1['image'], color_diffs, annotated_path1)
                    self.create_annotated_image(page2['image'], color_diffs, annotated_path2)

                    page_results.setdefault('annotated_images', {}).update({
                        'pdf1': f'results/{session_id}/page_{i+1}_pdf1_annotated.png',
                        'pdf2': f'results/{session_id}/page_{i+1}_pdf2_annotated.png'
                    })
                else:
                    # If no color differences, use original images
                    page_results.setdefault('annotated_images', {}).update({
                        'pdf1': f'results/{session_id}/page_{i+1}_pdf1_original.png',
                        'pdf2': f'results/{session_id}/page_{i+1}_pdf2_original.png'
                    })

                results['text_comparison'].append(page_results)

            # Aggregate spelling issues
            print("Aggregating results...")
            all_spelling_issues = []
            for page in results['text_comparison']:
                all_spelling_issues.extend(page['spelling_issues_pdf1'])
                all_spelling_issues.extend(page['spelling_issues_pdf2'])

            results['spelling_issues'] = all_spelling_issues

            # Aggregate barcodes and QR codes
            all_barcodes = []
            for page in results['text_comparison']:
                all_barcodes.extend(page['barcodes_pdf1'])
                all_barcodes.extend(page['barcodes_pdf2'])

            results['barcodes_qr_codes'] = all_barcodes

            elapsed_time = time.time() - start_time
            print(f"PDF comparison completed in {elapsed_time:.2f} seconds.")

            return results

        except Exception as e:
            print(f"Error in PDF comparison: {str(e)}")
            raise Exception("INVALID DOCUMENT")

# Enhanced OCR for tiny fonts - deployment check
# Force rebuild - Thu Sep 4 09:33:44 EDT 2025
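For reference, a minimal sketch of driving the comparator directly from a script, outside the Flask app. The class name PDFComparator, the module path, and the sample file names are assumptions here (only the method bodies appear in this diff), so adjust them to the actual names in ProofCheck/pdf_comparator.py. Both inputs must pass validate_pdf() (i.e. contain "50 Carroll"); otherwise compare_pdfs() raises Exception("INVALID DOCUMENT").

    # Hypothetical usage sketch; class, module, and file names are assumptions, not taken from this diff.
    import json
    import uuid

    from pdf_comparator import PDFComparator  # assumed module/class name

    comparator = PDFComparator()
    session_id = str(uuid.uuid4())

    # Annotated and spelling-overlay PNGs are written under static/results/<session_id>/
    results = comparator.compare_pdfs("uploads/proof_v1.pdf", "uploads/proof_v2.pdf", session_id)

    print(f"Pages compared: {len(results['text_comparison'])}")
    print(f"Barcodes/QR codes found: {len(results['barcodes_qr_codes'])}")
    print(json.dumps(results["spelling_issues"], indent=2, default=str))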
ProofCheck/requirements.txt
ADDED
@@ -0,0 +1,20 @@
Flask==2.3.3
Werkzeug==2.3.7
PyPDF2==3.0.1
pdf2image==1.16.3
Pillow==10.0.1
opencv-python==4.8.1.78
pytesseract==0.3.10
pyzbar==0.1.9
pyspellchecker==0.7.2
nltk==3.8.1
numpy==1.24.3
scikit-image==0.21.0
matplotlib==3.7.2
pandas==2.0.3
reportlab==4.0.4
python-barcode==0.15.1
zxing-cpp==2.0.0
dbr==9.6.30
PyMuPDF==1.23.8
regex==2023.10.3
ProofCheck/run.py
ADDED
|
@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Startup script for PDF Comparison Tool
"""

import os
import sys
import subprocess
import webbrowser
import time
from pathlib import Path

def check_python_version():
    """Check if Python version is compatible"""
    if sys.version_info < (3, 7):
        print("❌ Python 3.7 or higher is required")
        print(f"Current version: {sys.version}")
        return False
    print(f"✅ Python {sys.version.split()[0]} is compatible")
    return True

def check_dependencies():
    """Check if required dependencies are installed"""
    try:
        import flask
        import cv2
        import numpy
        import PIL
        import pytesseract
        import pdf2image
        import pyzbar
        import spellchecker
        import nltk
        import skimage
        print("✅ All Python dependencies are installed")
        return True
    except ImportError as e:
        print(f"❌ Missing dependency: {e}")
        print("Please run: pip install -r requirements.txt")
        return False

def check_tesseract():
    """Check if Tesseract OCR is installed"""
    try:
        import pytesseract
        pytesseract.get_tesseract_version()
        print("✅ Tesseract OCR is available")
        return True
    except Exception as e:
        print(f"❌ Tesseract OCR not found: {e}")
        print("Please install Tesseract:")
        print("  macOS: brew install tesseract")
        print("  Ubuntu: sudo apt-get install tesseract-ocr")
        print("  Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki")
        return False

def create_directories():
    """Create necessary directories"""
    directories = ['uploads', 'results', 'static/results']
    for directory in directories:
        Path(directory).mkdir(parents=True, exist_ok=True)
    print("✅ Directories created")

def start_application():
    """Start the Flask application"""
    print("\nStarting PDF Comparison Tool...")
    print("The application will be available at: http://localhost:5000")
    print("ℹ️ Press Ctrl+C to stop the application")
    print("-" * 50)

    try:
        # Start the Flask app
        from app import app
        app.run(debug=True, host='0.0.0.0', port=5000)
    except KeyboardInterrupt:
        print("\nApplication stopped by user")
    except Exception as e:
        print(f"❌ Error starting application: {e}")
        return False

    return True

def main():
    """Main startup function"""
    print("=" * 50)
    print("PDF Comparison Tool")
    print("=" * 50)

    # Check requirements
    if not check_python_version():
        sys.exit(1)

    if not check_dependencies():
        sys.exit(1)

    if not check_tesseract():
        sys.exit(1)

    # Create directories
    create_directories()

    # Ask user if they want to open browser
    try:
        response = input("\nOpen browser automatically? (y/n): ").lower().strip()
        if response in ['y', 'yes']:
            # Wait a moment for the server to start
            def open_browser():
                time.sleep(2)
                webbrowser.open('http://localhost:5000')

            import threading
            browser_thread = threading.Thread(target=open_browser)
            browser_thread.daemon = True
            browser_thread.start()
    except KeyboardInterrupt:
        print("\nSetup cancelled by user")
        sys.exit(0)

    # Start the application
    start_application()

if __name__ == "__main__":
    main()
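When `check_tesseract()` fails even though Tesseract is installed, the usual cause is a binary that is not on `PATH`. A minimal sketch, assuming a typical install location (adjust the path for your system):

```python
# Illustrative override for a Tesseract binary outside PATH.
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r"/usr/local/bin/tesseract"  # assumed install path
print(pytesseract.get_tesseract_version())
```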
ProofCheck/static/css/style.css
ADDED
|
@@ -0,0 +1,324 @@
| 1 |
+
/* Custom styles for PDF Comparison Tool */
|
| 2 |
+
|
| 3 |
+
body {
|
| 4 |
+
background-color: hsl(202, 68%, 79%);
|
| 5 |
+
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
.navbar-brand {
|
| 9 |
+
font-weight: 600;
|
| 10 |
+
font-size: 1.5rem;
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
.card {
|
| 14 |
+
border: none;
|
| 15 |
+
border-radius: 12px;
|
| 16 |
+
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
|
| 17 |
+
transition: transform 0.2s ease-in-out;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
.card:hover {
|
| 21 |
+
transform: translateY(-2px);
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
.card-header {
|
| 25 |
+
border-radius: 12px 12px 0 0 !important;
|
| 26 |
+
border-bottom: none;
|
| 27 |
+
font-weight: 600;
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.btn-primary {
|
| 31 |
+
background: linear-gradient(135deg, #007bff, #0056b3);
|
| 32 |
+
border: none;
|
| 33 |
+
border-radius: 8px;
|
| 34 |
+
font-weight: 600;
|
| 35 |
+
padding: 12px 24px;
|
| 36 |
+
transition: all 0.3s ease;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
.btn-primary:hover {
|
| 40 |
+
background: linear-gradient(135deg, #0056b3, #004085);
|
| 41 |
+
transform: translateY(-1px);
|
| 42 |
+
box-shadow: 0 4px 8px rgba(0, 123, 255, 0.3);
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
.form-control {
|
| 46 |
+
border-radius: 8px;
|
| 47 |
+
border: 2px solid #e9ecef;
|
| 48 |
+
padding: 12px 16px;
|
| 49 |
+
transition: border-color 0.3s ease;
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
.form-control:focus {
|
| 53 |
+
border-color: #007bff;
|
| 54 |
+
box-shadow: 0 0 0 0.2rem rgba(0, 123, 255, 0.25);
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
/* Drag and Drop Styles */
|
| 58 |
+
.drag-drop-zone {
|
| 59 |
+
position: relative;
|
| 60 |
+
border: 3px dashed #dee2e6;
|
| 61 |
+
border-radius: 12px;
|
| 62 |
+
padding: 40px 20px;
|
| 63 |
+
text-align: center;
|
| 64 |
+
background-color: #f8f9fa;
|
| 65 |
+
transition: all 0.3s ease;
|
| 66 |
+
cursor: pointer;
|
| 67 |
+
min-height: 200px;
|
| 68 |
+
display: flex;
|
| 69 |
+
align-items: center;
|
| 70 |
+
justify-content: center;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
.drag-drop-zone:hover {
|
| 74 |
+
border-color: #007bff;
|
| 75 |
+
background-color: #f0f8ff;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
.drag-drop-zone.drag-over {
|
| 79 |
+
border-color: #28a745;
|
| 80 |
+
background-color: #f0fff0;
|
| 81 |
+
transform: scale(1.02);
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
.drag-drop-zone.has-file {
|
| 85 |
+
border-color: #28a745;
|
| 86 |
+
background-color: #f0fff0;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
.drag-drop-content {
|
| 90 |
+
pointer-events: none;
|
| 91 |
+
z-index: 1;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
.drag-drop-text {
|
| 95 |
+
font-size: 1.1rem;
|
| 96 |
+
font-weight: 600;
|
| 97 |
+
color: #495057;
|
| 98 |
+
margin-bottom: 8px;
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
.drag-drop-hint {
|
| 102 |
+
font-size: 0.9rem;
|
| 103 |
+
color: #6c757d;
|
| 104 |
+
margin-bottom: 0;
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
.drag-drop-input {
|
| 108 |
+
position: absolute;
|
| 109 |
+
top: 0;
|
| 110 |
+
left: 0;
|
| 111 |
+
width: 100%;
|
| 112 |
+
height: 100%;
|
| 113 |
+
opacity: 0;
|
| 114 |
+
cursor: pointer;
|
| 115 |
+
z-index: 2;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
.drag-drop-zone .file-info {
|
| 119 |
+
display: none;
|
| 120 |
+
margin-top: 15px;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
.drag-drop-zone.has-file .file-info {
|
| 124 |
+
display: block;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
.drag-drop-zone.has-file .drag-drop-content {
|
| 128 |
+
display: none;
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
.file-info {
|
| 132 |
+
background: rgba(40, 167, 69, 0.1);
|
| 133 |
+
border: 1px solid #28a745;
|
| 134 |
+
border-radius: 8px;
|
| 135 |
+
padding: 10px;
|
| 136 |
+
margin-top: 10px;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
.file-info i {
|
| 140 |
+
color: #28a745;
|
| 141 |
+
margin-right: 8px;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
.nav-tabs .nav-link {
|
| 145 |
+
border: none;
|
| 146 |
+
border-radius: 8px 8px 0 0;
|
| 147 |
+
color: #6c757d;
|
| 148 |
+
font-weight: 500;
|
| 149 |
+
padding: 12px 20px;
|
| 150 |
+
transition: all 0.3s ease;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
.nav-tabs .nav-link:hover {
|
| 154 |
+
color: #007bff;
|
| 155 |
+
background-color: #f8f9fa;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
.nav-tabs .nav-link.active {
|
| 159 |
+
background-color: #007bff;
|
| 160 |
+
color: white;
|
| 161 |
+
border: none;
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
.alert {
|
| 165 |
+
border-radius: 8px;
|
| 166 |
+
border: none;
|
| 167 |
+
font-weight: 500;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
.spinner-border {
|
| 171 |
+
width: 3rem;
|
| 172 |
+
height: 3rem;
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
.progress {
|
| 176 |
+
height: 8px;
|
| 177 |
+
border-radius: 4px;
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
.progress-bar {
|
| 181 |
+
border-radius: 4px;
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
/* Comparison results styling */
|
| 185 |
+
.comparison-image {
|
| 186 |
+
max-width: 100%;
|
| 187 |
+
height: auto;
|
| 188 |
+
border-radius: 8px;
|
| 189 |
+
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
| 190 |
+
margin: 10px 0;
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
.difference-box {
|
| 194 |
+
border: 3px solid #dc3545;
|
| 195 |
+
border-radius: 4px;
|
| 196 |
+
position: relative;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
.difference-box::after {
|
| 200 |
+
content: "Difference";
|
| 201 |
+
position: absolute;
|
| 202 |
+
top: -10px;
|
| 203 |
+
left: 10px;
|
| 204 |
+
background: #dc3545;
|
| 205 |
+
color: white;
|
| 206 |
+
padding: 2px 8px;
|
| 207 |
+
border-radius: 4px;
|
| 208 |
+
font-size: 12px;
|
| 209 |
+
font-weight: bold;
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
/* Table styling */
|
| 213 |
+
.table {
|
| 214 |
+
border-radius: 8px;
|
| 215 |
+
overflow: hidden;
|
| 216 |
+
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
.table thead th {
|
| 220 |
+
background-color: #f8f9fa;
|
| 221 |
+
border-bottom: 2px solid #dee2e6;
|
| 222 |
+
font-weight: 600;
|
| 223 |
+
color: #495057;
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
.table tbody tr:hover {
|
| 227 |
+
background-color: #f8f9fa;
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
/* Badge styling */
|
| 231 |
+
.badge {
|
| 232 |
+
font-size: 0.8em;
|
| 233 |
+
padding: 6px 10px;
|
| 234 |
+
border-radius: 6px;
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
.badge-danger {
|
| 238 |
+
background-color: #dc3545;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
.badge-warning {
|
| 242 |
+
background-color: #ffc107;
|
| 243 |
+
color: #212529;
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
.badge-success {
|
| 247 |
+
background-color: #28a745;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
.badge-info {
|
| 251 |
+
background-color: #17a2b8;
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
/* Responsive design */
|
| 255 |
+
@media (max-width: 768px) {
|
| 256 |
+
.container {
|
| 257 |
+
padding: 0 15px;
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
.card {
|
| 261 |
+
margin-bottom: 20px;
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
.nav-tabs .nav-link {
|
| 265 |
+
padding: 8px 12px;
|
| 266 |
+
font-size: 14px;
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
.btn-lg {
|
| 270 |
+
padding: 10px 20px;
|
| 271 |
+
font-size: 16px;
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
.drag-drop-zone {
|
| 275 |
+
min-height: 150px;
|
| 276 |
+
padding: 30px 15px;
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
.drag-drop-text {
|
| 280 |
+
font-size: 1rem;
|
| 281 |
+
}
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
/* Loading animation */
|
| 285 |
+
@keyframes pulse {
|
| 286 |
+
0% { opacity: 1; }
|
| 287 |
+
50% { opacity: 0.5; }
|
| 288 |
+
100% { opacity: 1; }
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
.loading-pulse {
|
| 292 |
+
animation: pulse 1.5s infinite;
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
/* Custom scrollbar */
|
| 296 |
+
::-webkit-scrollbar {
|
| 297 |
+
width: 8px;
|
| 298 |
+
}
|
| 299 |
+
|
| 300 |
+
::-webkit-scrollbar-track {
|
| 301 |
+
background: #f1f1f1;
|
| 302 |
+
border-radius: 4px;
|
| 303 |
+
}
|
| 304 |
+
|
| 305 |
+
::-webkit-scrollbar-thumb {
|
| 306 |
+
background: #c1c1c1;
|
| 307 |
+
border-radius: 4px;
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
::-webkit-scrollbar-thumb:hover {
|
| 311 |
+
background: #a8a8a8;
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
/* Print styles */
|
| 315 |
+
@media print {
|
| 316 |
+
.navbar, .btn, .nav-tabs {
|
| 317 |
+
display: none !important;
|
| 318 |
+
}
|
| 319 |
+
|
| 320 |
+
.card {
|
| 321 |
+
box-shadow: none !important;
|
| 322 |
+
border: 1px solid #dee2e6 !important;
|
| 323 |
+
}
|
| 324 |
+
}
|
ProofCheck/static/js/script.js
ADDED
|
@@ -0,0 +1,353 @@
| 1 |
+
// PDF Comparison Tool JavaScript
|
| 2 |
+
|
| 3 |
+
document.addEventListener('DOMContentLoaded', function() {
|
| 4 |
+
const uploadForm = document.getElementById('uploadForm');
|
| 5 |
+
const loadingSection = document.getElementById('loadingSection');
|
| 6 |
+
const resultsSection = document.getElementById('resultsSection');
|
| 7 |
+
const errorSection = document.getElementById('errorSection');
|
| 8 |
+
const errorMessage = document.getElementById('errorMessage');
|
| 9 |
+
|
| 10 |
+
// Initialize drag and drop zones
|
| 11 |
+
initializeDragAndDrop('dragZone1', 'pdf1');
|
| 12 |
+
initializeDragAndDrop('dragZone2', 'pdf2');
|
| 13 |
+
|
| 14 |
+
// Handle form submission
|
| 15 |
+
uploadForm.addEventListener('submit', function(e) {
|
| 16 |
+
e.preventDefault();
|
| 17 |
+
|
| 18 |
+
const formData = new FormData(uploadForm);
|
| 19 |
+
const pdf1 = document.getElementById('pdf1').files[0];
|
| 20 |
+
const pdf2 = document.getElementById('pdf2').files[0];
|
| 21 |
+
|
| 22 |
+
// Validate files
|
| 23 |
+
if (!pdf1 || !pdf2) {
|
| 24 |
+
showError('Please select both PDF files.');
|
| 25 |
+
return;
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
if (!pdf1.name.toLowerCase().endsWith('.pdf') || !pdf2.name.toLowerCase().endsWith('.pdf')) {
|
| 29 |
+
showError('Please select valid PDF files.');
|
| 30 |
+
return;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
// Show loading
|
| 34 |
+
showLoading();
|
| 35 |
+
hideError();
|
| 36 |
+
|
| 37 |
+
// Submit form via AJAX
|
| 38 |
+
fetch('/upload', {
|
| 39 |
+
method: 'POST',
|
| 40 |
+
body: formData
|
| 41 |
+
})
|
| 42 |
+
.then(response => response.json())
|
| 43 |
+
.then(data => {
|
| 44 |
+
hideLoading();
|
| 45 |
+
|
| 46 |
+
if (data.success) {
|
| 47 |
+
displayResults(data.results);
|
| 48 |
+
} else {
|
| 49 |
+
showError(data.error || 'An error occurred during comparison.');
|
| 50 |
+
}
|
| 51 |
+
})
|
| 52 |
+
.catch(error => {
|
| 53 |
+
hideLoading();
|
| 54 |
+
showError('Network error: ' + error.message);
|
| 55 |
+
});
|
| 56 |
+
});
|
| 57 |
+
|
| 58 |
+
function initializeDragAndDrop(zoneId, inputId) {
|
| 59 |
+
const zone = document.getElementById(zoneId);
|
| 60 |
+
const input = document.getElementById(inputId);
|
| 61 |
+
|
| 62 |
+
if (!zone || !input) return;
|
| 63 |
+
|
| 64 |
+
// Create file info display
|
| 65 |
+
const fileInfo = document.createElement('div');
|
| 66 |
+
fileInfo.className = 'file-info';
|
| 67 |
+
fileInfo.innerHTML = '<i class="fas fa-file-pdf"></i><span class="file-name"></span>';
|
| 68 |
+
zone.appendChild(fileInfo);
|
| 69 |
+
|
| 70 |
+
// Drag and drop events
|
| 71 |
+
zone.addEventListener('dragover', function(e) {
|
| 72 |
+
e.preventDefault();
|
| 73 |
+
e.stopPropagation();
|
| 74 |
+
zone.classList.add('drag-over');
|
| 75 |
+
});
|
| 76 |
+
|
| 77 |
+
zone.addEventListener('dragleave', function(e) {
|
| 78 |
+
e.preventDefault();
|
| 79 |
+
e.stopPropagation();
|
| 80 |
+
zone.classList.remove('drag-over');
|
| 81 |
+
});
|
| 82 |
+
|
| 83 |
+
zone.addEventListener('drop', function(e) {
|
| 84 |
+
e.preventDefault();
|
| 85 |
+
e.stopPropagation();
|
| 86 |
+
zone.classList.remove('drag-over');
|
| 87 |
+
|
| 88 |
+
const files = e.dataTransfer.files;
|
| 89 |
+
if (files.length > 0) {
|
| 90 |
+
const file = files[0];
|
| 91 |
+
if (file.type === 'application/pdf' || file.name.toLowerCase().endsWith('.pdf')) {
|
| 92 |
+
handleFileSelect(file, input, zone);
|
| 93 |
+
} else {
|
| 94 |
+
showError('Please select a valid PDF file.');
|
| 95 |
+
}
|
| 96 |
+
}
|
| 97 |
+
});
|
| 98 |
+
|
| 99 |
+
// Click to browse
|
| 100 |
+
zone.addEventListener('click', function(e) {
|
| 101 |
+
if (e.target !== input) {
|
| 102 |
+
input.click();
|
| 103 |
+
}
|
| 104 |
+
});
|
| 105 |
+
|
| 106 |
+
// File input change
|
| 107 |
+
input.addEventListener('change', function(e) {
|
| 108 |
+
const file = e.target.files[0];
|
| 109 |
+
if (file) {
|
| 110 |
+
handleFileSelect(file, input, zone);
|
| 111 |
+
}
|
| 112 |
+
});
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
function handleFileSelect(file, input, zone) {
|
| 116 |
+
// Update the file input
|
| 117 |
+
const dataTransfer = new DataTransfer();
|
| 118 |
+
dataTransfer.items.add(file);
|
| 119 |
+
input.files = dataTransfer.files;
|
| 120 |
+
|
| 121 |
+
// Update visual feedback
|
| 122 |
+
zone.classList.add('has-file');
|
| 123 |
+
const fileName = zone.querySelector('.file-name');
|
| 124 |
+
if (fileName) {
|
| 125 |
+
fileName.textContent = file.name;
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
// Update form text
|
| 129 |
+
const formText = zone.querySelector('.drag-drop-hint');
|
| 130 |
+
if (formText) {
|
| 131 |
+
formText.textContent = `Selected: ${file.name}`;
|
| 132 |
+
}
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
function showLoading() {
|
| 136 |
+
loadingSection.style.display = 'block';
|
| 137 |
+
resultsSection.style.display = 'none';
|
| 138 |
+
errorSection.style.display = 'none';
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
function hideLoading() {
|
| 142 |
+
loadingSection.style.display = 'none';
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
function showError(message) {
|
| 146 |
+
errorMessage.textContent = message;
|
| 147 |
+
errorSection.style.display = 'block';
|
| 148 |
+
resultsSection.style.display = 'none';
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
function hideError() {
|
| 152 |
+
errorSection.style.display = 'none';
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
function displayResults(results) {
|
| 156 |
+
resultsSection.style.display = 'block';
|
| 157 |
+
|
| 158 |
+
// Display visual comparison
|
| 159 |
+
displayVisualComparison(results);
|
| 160 |
+
|
| 161 |
+
// Display spelling issues
|
| 162 |
+
displaySpellingIssues(results);
|
| 163 |
+
|
| 164 |
+
// Display barcodes and QR codes
|
| 165 |
+
displayBarcodes(results);
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
function displayVisualComparison(results) {
|
| 169 |
+
const visualContent = document.getElementById('visualComparisonContent');
|
| 170 |
+
let html = '<div class="row">';
|
| 171 |
+
|
| 172 |
+
if (results.text_comparison && results.text_comparison.length > 0) {
|
| 173 |
+
results.text_comparison.forEach((page, index) => {
|
| 174 |
+
html += `
|
| 175 |
+
<div class="col-12 mb-4">
|
| 176 |
+
<h6 class="text-primary mb-3">Page ${page.page}</h6>
|
| 177 |
+
<div class="row">
|
| 178 |
+
<div class="col-md-6">
|
| 179 |
+
<h6>PDF 1</h6>
|
| 180 |
+
${page.annotated_images && page.annotated_images.pdf1 ?
|
| 181 |
+
`<img src="/static/${page.annotated_images.pdf1}" class="comparison-image" alt="PDF 1 Page ${page.page}">` :
|
| 182 |
+
'<p class="text-muted">No differences detected</p>'
|
| 183 |
+
}
|
| 184 |
+
</div>
|
| 185 |
+
<div class="col-md-6">
|
| 186 |
+
<h6>PDF 2</h6>
|
| 187 |
+
${page.annotated_images && page.annotated_images.pdf2 ?
|
| 188 |
+
`<img src="/static/${page.annotated_images.pdf2}" class="comparison-image" alt="PDF 2 Page ${page.page}">` :
|
| 189 |
+
'<p class="text-muted">No differences detected</p>'
|
| 190 |
+
}
|
| 191 |
+
</div>
|
| 192 |
+
</div>
|
| 193 |
+
${page.color_differences && page.color_differences.length > 0 ?
|
| 194 |
+
`<div class="mt-3">
|
| 195 |
+
<span class="badge badge-danger">${page.color_differences.length} color difference(s) detected</span>
|
| 196 |
+
</div>` :
|
| 197 |
+
'<div class="mt-3"><span class="badge badge-success">No color differences</span></div>'
|
| 198 |
+
}
|
| 199 |
+
</div>
|
| 200 |
+
`;
|
| 201 |
+
});
|
| 202 |
+
} else {
|
| 203 |
+
html += '<div class="col-12"><p class="text-muted">No visual comparison data available.</p></div>';
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
html += '</div>';
|
| 207 |
+
visualContent.innerHTML = html;
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
function displaySpellingIssues(results) {
|
| 211 |
+
const spellingContent = document.getElementById('spellingIssuesContent');
|
| 212 |
+
let html = '';
|
| 213 |
+
|
| 214 |
+
if (results.spelling_issues && results.spelling_issues.length > 0) {
|
| 215 |
+
html += `
|
| 216 |
+
<div class="table-responsive">
|
| 217 |
+
<table class="table table-striped">
|
| 218 |
+
<thead>
|
| 219 |
+
<tr>
|
| 220 |
+
<th>Word</th>
|
| 221 |
+
<th>Original</th>
|
| 222 |
+
<th>Misspelled In</th>
|
| 223 |
+
<th>English Suggestions</th>
|
| 224 |
+
<th>French Suggestions</th>
|
| 225 |
+
</tr>
|
| 226 |
+
</thead>
|
| 227 |
+
<tbody>
|
| 228 |
+
`;
|
| 229 |
+
|
| 230 |
+
results.spelling_issues.forEach(issue => {
|
| 231 |
+
const misspelledIn = issue.misspelled_in ? issue.misspelled_in.join(', ') : 'Unknown';
|
| 232 |
+
const englishSuggestions = issue.suggestions.english ? issue.suggestions.english.join(', ') : 'None';
|
| 233 |
+
const frenchSuggestions = issue.suggestions.french ? issue.suggestions.french.join(', ') : 'None';
|
| 234 |
+
|
| 235 |
+
html += `
|
| 236 |
+
<tr>
|
| 237 |
+
<td><strong>${issue.word}</strong></td>
|
| 238 |
+
<td><code>${issue.original_word}</code></td>
|
| 239 |
+
<td><span class="badge badge-warning">${misspelledIn}</span></td>
|
| 240 |
+
<td>${englishSuggestions}</td>
|
| 241 |
+
<td>${frenchSuggestions}</td>
|
| 242 |
+
</tr>
|
| 243 |
+
`;
|
| 244 |
+
});
|
| 245 |
+
|
| 246 |
+
html += `
|
| 247 |
+
</tbody>
|
| 248 |
+
</table>
|
| 249 |
+
</div>
|
| 250 |
+
<div class="mt-3">
|
| 251 |
+
<span class="badge badge-warning">${results.spelling_issues.length} spelling issue(s) found</span>
|
| 252 |
+
</div>
|
| 253 |
+
`;
|
| 254 |
+
} else {
|
| 255 |
+
html = '<div class="alert alert-success"><i class="fas fa-check me-2"></i>No spelling issues detected.</div>';
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
spellingContent.innerHTML = html;
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
function displayBarcodes(results) {
|
| 262 |
+
const barcodesContent = document.getElementById('barcodesContent');
|
| 263 |
+
let html = '';
|
| 264 |
+
|
| 265 |
+
if (results.barcodes_qr_codes && results.barcodes_qr_codes.length > 0) {
|
| 266 |
+
html += `
|
| 267 |
+
<div class="table-responsive">
|
| 268 |
+
<table class="table table-striped">
|
| 269 |
+
<thead>
|
| 270 |
+
<tr>
|
| 271 |
+
<th>Type</th>
|
| 272 |
+
<th>Data</th>
|
| 273 |
+
<th>Stack Type</th>
|
| 274 |
+
<th>Size</th>
|
| 275 |
+
<th>Method</th>
|
| 276 |
+
<th>Confidence</th>
|
| 277 |
+
<th>GS1 Valid</th>
|
| 278 |
+
<th>Position</th>
|
| 279 |
+
</tr>
|
| 280 |
+
</thead>
|
| 281 |
+
<tbody>
|
| 282 |
+
`;
|
| 283 |
+
|
| 284 |
+
results.barcodes_qr_codes.forEach(barcode => {
|
| 285 |
+
const position = `(${barcode.rect.left}, ${barcode.rect.top}) - (${barcode.rect.left + barcode.rect.width}, ${barcode.rect.top + barcode.rect.height})`;
|
| 286 |
+
const stackType = barcode.stack_type || 'Single Stack';
|
| 287 |
+
const method = barcode.method || 'Unknown';
|
| 288 |
+
const confidence = barcode.confidence || 0;
|
| 289 |
+
const gs1Valid = barcode.gs1_validated ? 'Yes' : 'No';
|
| 290 |
+
const sizeCategory = barcode.size_category || 'Normal';
|
| 291 |
+
const resolution = barcode.resolution || '';
|
| 292 |
+
|
| 293 |
+
// Format DataBar Expanded data if available
|
| 294 |
+
let dataDisplay = barcode.data;
|
| 295 |
+
if (barcode.expanded_data) {
|
| 296 |
+
dataDisplay = '<div><strong>Parsed Data:</strong><br>';
|
| 297 |
+
for (const [key, value] of Object.entries(barcode.expanded_data)) {
|
| 298 |
+
dataDisplay += `<span class="badge badge-info">${key}: ${value}</span> `;
|
| 299 |
+
}
|
| 300 |
+
dataDisplay += '</div>';
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
// Confidence color coding
|
| 304 |
+
let confidenceClass = 'badge-secondary';
|
| 305 |
+
if (confidence >= 80) confidenceClass = 'badge-success';
|
| 306 |
+
else if (confidence >= 60) confidenceClass = 'badge-warning';
|
| 307 |
+
else if (confidence >= 40) confidenceClass = 'badge-info';
|
| 308 |
+
|
| 309 |
+
// GS1 validation color
|
| 310 |
+
let gs1Class = barcode.gs1_validated ? 'badge-success' : 'badge-danger';
|
| 311 |
+
|
| 312 |
+
// Size category color
|
| 313 |
+
let sizeClass = 'badge-secondary';
|
| 314 |
+
if (sizeCategory === 'small') sizeClass = 'badge-warning';
|
| 315 |
+
else if (sizeCategory === 'tiny') sizeClass = 'badge-danger';
|
| 316 |
+
|
| 317 |
+
// Method display with resolution
|
| 318 |
+
let methodDisplay = method;
|
| 319 |
+
if (resolution) {
|
| 320 |
+
methodDisplay += `<br><small>${resolution}</small>`;
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
html += `
|
| 324 |
+
<tr>
|
| 325 |
+
<td><span class="badge badge-info">${barcode.type}</span></td>
|
| 326 |
+
<td>${dataDisplay}</td>
|
| 327 |
+
<td><span class="badge badge-secondary">${stackType}</span></td>
|
| 328 |
+
<td><span class="badge ${sizeClass}">${sizeCategory}</span></td>
|
| 329 |
+
<td><span class="badge badge-dark">${methodDisplay}</span></td>
|
| 330 |
+
<td><span class="badge ${confidenceClass}">${confidence}%</span></td>
|
| 331 |
+
<td><span class="badge ${gs1Class}">${gs1Valid}</span></td>
|
| 332 |
+
<td><small>${position}</small></td>
|
| 333 |
+
</tr>
|
| 334 |
+
`;
|
| 335 |
+
});
|
| 336 |
+
|
| 337 |
+
html += `
|
| 338 |
+
</tbody>
|
| 339 |
+
</table>
|
| 340 |
+
</div>
|
| 341 |
+
<div class="mt-3">
|
| 342 |
+
<span class="badge badge-info">${results.barcodes_qr_codes.length} barcode/QR code(s) detected</span>
|
| 343 |
+
<span class="badge badge-success">Enhanced DataBar detection active</span>
|
| 344 |
+
<span class="badge badge-warning">Small barcode detection active</span>
|
| 345 |
+
</div>
|
| 346 |
+
`;
|
| 347 |
+
} else {
|
| 348 |
+
html = '<div class="alert alert-info"><i class="fas fa-info-circle me-2"></i>No barcodes or QR codes detected.</div>';
|
| 349 |
+
}
|
| 350 |
+
|
| 351 |
+
barcodesContent.innerHTML = html;
|
| 352 |
+
}
|
| 353 |
+
});
|
ProofCheck/templates/index.html
ADDED
|
@@ -0,0 +1,154 @@
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>PDF Comparison Tool</title>
|
| 7 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
|
| 8 |
+
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
|
| 9 |
+
<link href="{{ url_for('static', filename='css/style.css') }}" rel="stylesheet">
|
| 10 |
+
</head>
|
| 11 |
+
<body>
|
| 12 |
+
<div class="container-fluid">
|
| 13 |
+
<div class="row">
|
| 14 |
+
<!-- Header -->
|
| 15 |
+
<div class="col-12">
|
| 16 |
+
<nav class="navbar navbar-expand-lg navbar-dark bg-primary">
|
| 17 |
+
<div class="container">
|
| 18 |
+
<a class="navbar-brand" href="#">
|
| 19 |
+
<i class="fas fa-file-pdf me-2"></i>
|
| 20 |
+
PDF Comparison Tool
|
| 21 |
+
</a>
|
| 22 |
+
</div>
|
| 23 |
+
</nav>
|
| 24 |
+
</div>
|
| 25 |
+
</div>
|
| 26 |
+
|
| 27 |
+
<div class="row mt-4">
|
| 28 |
+
<div class="col-12">
|
| 29 |
+
<div class="container">
|
| 30 |
+
<!-- Upload Section -->
|
| 31 |
+
<div class="card shadow-sm">
|
| 32 |
+
<div class="card-header bg-light">
|
| 33 |
+
<h5 class="mb-0">
|
| 34 |
+
<i class="fas fa-upload me-2"></i>
|
| 35 |
+
Upload PDF Files for Comparison
|
| 36 |
+
</h5>
|
| 37 |
+
</div>
|
| 38 |
+
<div class="card-body">
|
| 39 |
+
<form id="uploadForm" enctype="multipart/form-data">
|
| 40 |
+
<div class="row">
|
| 41 |
+
<div class="col-md-6">
|
| 42 |
+
<div class="mb-3">
|
| 43 |
+
<label for="pdf1" class="form-label">First PDF File</label>
|
| 44 |
+
<div class="drag-drop-zone" id="dragZone1">
|
| 45 |
+
<div class="drag-drop-content">
|
| 46 |
+
<i class="fas fa-cloud-upload-alt fa-3x text-muted mb-3"></i>
|
| 47 |
+
<p class="drag-drop-text">Drag & drop PDF here or click to browse</p>
|
| 48 |
+
<p class="drag-drop-hint">Select a PDF file for comparison</p>
|
| 49 |
+
</div>
|
| 50 |
+
<input type="file" class="form-control drag-drop-input" id="pdf1" name="pdf1" accept=".pdf" required>
|
| 51 |
+
</div>
|
| 52 |
+
</div>
|
| 53 |
+
</div>
|
| 54 |
+
<div class="col-md-6">
|
| 55 |
+
<div class="mb-3">
|
| 56 |
+
<label for="pdf2" class="form-label">Second PDF File</label>
|
| 57 |
+
<div class="drag-drop-zone" id="dragZone2">
|
| 58 |
+
<div class="drag-drop-content">
|
| 59 |
+
<i class="fas fa-cloud-upload-alt fa-3x text-muted mb-3"></i>
|
| 60 |
+
<p class="drag-drop-text">Drag & drop PDF here or click to browse</p>
|
| 61 |
+
<p class="drag-drop-hint">Select a PDF file for comparison</p>
|
| 62 |
+
</div>
|
| 63 |
+
<input type="file" class="form-control drag-drop-input" id="pdf2" name="pdf2" accept=".pdf" required>
|
| 64 |
+
</div>
|
| 65 |
+
</div>
|
| 66 |
+
</div>
|
| 67 |
+
</div>
|
| 68 |
+
<div class="d-grid">
|
| 69 |
+
<button type="submit" class="btn btn-primary btn-lg">
|
| 70 |
+
<i class="fas fa-search me-2"></i>
|
| 71 |
+
Compare PDFs
|
| 72 |
+
</button>
|
| 73 |
+
</div>
|
| 74 |
+
</form>
|
| 75 |
+
</div>
|
| 76 |
+
</div>
|
| 77 |
+
|
| 78 |
+
<!-- Loading Section -->
|
| 79 |
+
<div id="loadingSection" class="card shadow-sm mt-4" style="display: none;">
|
| 80 |
+
<div class="card-body text-center">
|
| 81 |
+
<div class="spinner-border text-primary" role="status">
|
| 82 |
+
<span class="visually-hidden">Loading...</span>
|
| 83 |
+
</div>
|
| 84 |
+
<p class="mt-3">Processing PDFs... This may take a few minutes.</p>
|
| 85 |
+
<div class="progress mt-3">
|
| 86 |
+
<div class="progress-bar progress-bar-striped progress-bar-animated" role="progressbar" style="width: 100%"></div>
|
| 87 |
+
</div>
|
| 88 |
+
</div>
|
| 89 |
+
</div>
|
| 90 |
+
|
| 91 |
+
<!-- Results Section -->
|
| 92 |
+
<div id="resultsSection" class="mt-4" style="display: none;">
|
| 93 |
+
<!-- Comparison Results Tabs -->
|
| 94 |
+
<div class="card shadow-sm">
|
| 95 |
+
<div class="card-header">
|
| 96 |
+
<ul class="nav nav-tabs card-header-tabs" id="resultsTabs" role="tablist">
|
| 97 |
+
<li class="nav-item" role="presentation">
|
| 98 |
+
<button class="nav-link active" id="visual-tab" data-bs-toggle="tab" data-bs-target="#visual" type="button" role="tab">
|
| 99 |
+
<i class="fas fa-eye me-2"></i>Visual Comparison
|
| 100 |
+
</button>
|
| 101 |
+
</li>
|
| 102 |
+
<li class="nav-item" role="presentation">
|
| 103 |
+
<button class="nav-link" id="spelling-tab" data-bs-toggle="tab" data-bs-target="#spelling" type="button" role="tab">
|
| 104 |
+
<i class="fas fa-spell-check me-2"></i>Spelling Issues
|
| 105 |
+
</button>
|
| 106 |
+
</li>
|
| 107 |
+
<li class="nav-item" role="presentation">
|
| 108 |
+
<button class="nav-link" id="barcodes-tab" data-bs-toggle="tab" data-bs-target="#barcodes" type="button" role="tab">
|
| 109 |
+
<i class="fas fa-barcode me-2"></i>Barcodes & QR Codes
|
| 110 |
+
</button>
|
| 111 |
+
</li>
|
| 112 |
+
</ul>
|
| 113 |
+
</div>
|
| 114 |
+
<div class="card-body">
|
| 115 |
+
<div class="tab-content" id="resultsTabContent">
|
| 116 |
+
<!-- Visual Comparison Tab -->
|
| 117 |
+
<div class="tab-pane fade show active" id="visual" role="tabpanel">
|
| 118 |
+
<div id="visualComparisonContent">
|
| 119 |
+
<!-- Content will be populated by JavaScript -->
|
| 120 |
+
</div>
|
| 121 |
+
</div>
|
| 122 |
+
|
| 123 |
+
<!-- Spelling Issues Tab -->
|
| 124 |
+
<div class="tab-pane fade" id="spelling" role="tabpanel">
|
| 125 |
+
<div id="spellingIssuesContent">
|
| 126 |
+
<!-- Content will be populated by JavaScript -->
|
| 127 |
+
</div>
|
| 128 |
+
</div>
|
| 129 |
+
|
| 130 |
+
<!-- Barcodes Tab -->
|
| 131 |
+
<div class="tab-pane fade" id="barcodes" role="tabpanel">
|
| 132 |
+
<div id="barcodesContent">
|
| 133 |
+
<!-- Content will be populated by JavaScript -->
|
| 134 |
+
</div>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
+
</div>
|
| 138 |
+
</div>
|
| 139 |
+
</div>
|
| 140 |
+
|
| 141 |
+
<!-- Error Section -->
|
| 142 |
+
<div id="errorSection" class="alert alert-danger mt-4" style="display: none;">
|
| 143 |
+
<i class="fas fa-exclamation-triangle me-2"></i>
|
| 144 |
+
<span id="errorMessage"></span>
|
| 145 |
+
</div>
|
| 146 |
+
</div>
|
| 147 |
+
</div>
|
| 148 |
+
</div>
|
| 149 |
+
</div>
|
| 150 |
+
|
| 151 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
|
| 152 |
+
<script src="{{ url_for('static', filename='js/script.js') }}"></script>
|
| 153 |
+
</body>
|
| 154 |
+
</html>
|
ProofCheck/test_setup.py
ADDED
|
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""
Test script to verify PDF Comparison Tool setup
"""

import sys
import importlib

def test_imports():
    """Test if all required packages can be imported"""
    required_packages = [
        'flask',
        'cv2',
        'numpy',
        'PIL',
        'pytesseract',
        'pdf2image',
        'pyzbar',
        'spellchecker',
        'nltk',
        'skimage',
        'matplotlib',
        'pandas'
    ]

    print("Testing package imports...")
    failed_imports = []

    for package in required_packages:
        try:
            importlib.import_module(package)
            print(f"✅ {package}")
        except ImportError as e:
            print(f"❌ {package}: {e}")
            failed_imports.append(package)

    return failed_imports

def test_tesseract():
    """Test if Tesseract OCR is available"""
    print("\nTesting Tesseract OCR...")
    try:
        import pytesseract
        # Try to get Tesseract version
        version = pytesseract.get_tesseract_version()
        print(f"✅ Tesseract version: {version}")
        return True
    except Exception as e:
        print(f"❌ Tesseract not found: {e}")
        print("Please install Tesseract OCR:")
        print("  macOS: brew install tesseract")
        print("  Ubuntu: sudo apt-get install tesseract-ocr")
        print("  Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki")
        return False

def test_pdf_comparator():
    """Test if PDFComparator class can be instantiated"""
    print("\nTesting PDFComparator...")
    try:
        from pdf_comparator import PDFComparator
        comparator = PDFComparator()
        print("✅ PDFComparator initialized successfully")
        return True
    except Exception as e:
        print(f"❌ PDFComparator error: {e}")
        return False

def test_flask_app():
    """Test if Flask app can be imported"""
    print("\nTesting Flask application...")
    try:
        from app import app
        print("✅ Flask app imported successfully")
        return True
    except Exception as e:
        print(f"❌ Flask app error: {e}")
        return False

def main():
    """Run all tests"""
    print("PDF Comparison Tool - Setup Test")
    print("=" * 40)

    # Test imports
    failed_imports = test_imports()

    # Test Tesseract
    tesseract_ok = test_tesseract()

    # Test PDFComparator
    comparator_ok = test_pdf_comparator()

    # Test Flask app
    flask_ok = test_flask_app()

    # Summary
    print("\n" + "=" * 40)
    print("SETUP SUMMARY")
    print("=" * 40)

    if failed_imports:
        print(f"❌ Missing packages: {', '.join(failed_imports)}")
        print("Run: pip install -r requirements.txt")
    else:
        print("✅ All packages imported successfully")

    if tesseract_ok:
        print("✅ Tesseract OCR is available")
    else:
        print("❌ Tesseract OCR is not available")

    if comparator_ok:
        print("✅ PDFComparator is working")
    else:
        print("❌ PDFComparator has issues")

    if flask_ok:
        print("✅ Flask application is ready")
    else:
        print("❌ Flask application has issues")

    # Overall status
    all_ok = not failed_imports and tesseract_ok and comparator_ok and flask_ok

    if all_ok:
        print("\nSetup is complete! You can run the application with:")
        print("   python app.py")
    else:
        print("\n⚠️ Setup is incomplete. Please fix the issues above.")
        sys.exit(1)

if __name__ == "__main__":
    main()
README.md
ADDED
|
@@ -0,0 +1,203 @@
# PDF Comparison Tool

A comprehensive web-based tool for comparing PDF documents with advanced features including OCR validation, color difference detection, spelling verification, and barcode/QR code detection.

## Features

- **PDF Validation**: Ensures uploaded PDFs contain "50 Carroll" using OCR
- **Color Difference Detection**: Identifies visual differences between PDFs and highlights them with red boxes
- **Spelling Verification**: Checks text against both English and French dictionaries
- **Barcode/QR Code Detection**: Automatically detects and reads barcodes and QR codes
- **Visual Comparison**: Side-by-side comparison with annotated differences
- **Modern Web Interface**: Responsive design with Bootstrap and custom styling

## Requirements

### System Requirements
- Python 3.7 or higher
- macOS, Linux, or Windows
- Tesseract OCR engine (for text extraction)

### Python Dependencies
All dependencies are listed in `requirements.txt`:
- Flask (web framework)
- PyPDF2 (PDF processing)
- pdf2image (PDF to image conversion)
- OpenCV (image processing)
- pytesseract (OCR)
- pyzbar (barcode detection)
- pyspellchecker (spelling verification)
- scikit-image (image comparison)
- Pillow (image manipulation)

## Installation

### 1. Install Tesseract OCR

**macOS:**
```bash
brew install tesseract
```

**Ubuntu/Debian:**
```bash
sudo apt-get install tesseract-ocr
```

**Windows:**
Download from [Tesseract GitHub](https://github.com/UB-Mannheim/tesseract/wiki)

### 2. Install Python Dependencies

```bash
# Create virtual environment (recommended)
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Install dependencies
pip install -r requirements.txt
```

### 3. Download Language Data (if needed)

The application will automatically download required NLTK data on first run.

## Usage

### 1. Start the Application

```bash
python app.py
```

The application will start on `http://localhost:5000`

### 2. Upload PDFs

1. Open your web browser and navigate to `http://localhost:5000`
2. Select two PDF files for comparison
3. Both PDFs must contain "50 Carroll" for validation
4. Click "Compare PDFs" to start the analysis

### 3. View Results

The comparison results are displayed in three tabs:

- **Visual Comparison**: Side-by-side view with red boxes highlighting differences
- **Spelling Issues**: Table of spelling errors with suggestions from English and French dictionaries
- **Barcodes & QR Codes**: List of detected barcodes with their data and positions

## File Structure

```
ProofCheck/
├── app.py                 # Main Flask application
├── pdf_comparator.py      # PDF comparison logic
├── requirements.txt       # Python dependencies
├── README.md              # This file
├── templates/
│   └── index.html         # Main web interface
├── static/
│   ├── css/
│   │   └── style.css      # Custom styles
│   ├── js/
│   │   └── script.js      # Frontend JavaScript
│   └── results/           # Generated comparison images
├── uploads/               # Temporary uploaded files
└── results/               # Comparison results JSON files
```

## How It Works

### 1. PDF Validation
- Converts PDF pages to images using `pdf2image`
- Uses Tesseract OCR to extract text
- Validates presence of "50 Carroll" in extracted text (a sketch of this step follows below)
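A minimal sketch of this validation step, assuming `pdf2image.convert_from_path` and `pytesseract.image_to_string`; the actual helper in `pdf_comparator.py` may be structured differently:

```python
from pdf2image import convert_from_path
import pytesseract

def contains_required_text(pdf_path, needle="50 Carroll", dpi=200):
    # Render each page, OCR it, and stop as soon as the marker text is found.
    for page_image in convert_from_path(pdf_path, dpi=dpi):
        text = pytesseract.image_to_string(page_image)
        if needle.lower() in text.lower():
            return True
    return False
```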

### 2. Color Difference Detection
- Converts PDF pages to images
- Resizes images to same dimensions
- Uses structural similarity index (SSIM) to detect differences
- Draws red rectangles around detected differences (a sketch of this step follows below)
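A minimal sketch of the SSIM-based highlighting with `skimage` and OpenCV; the threshold and minimum region size are illustrative, not the values used in `pdf_comparator.py`:

```python
import cv2
import numpy as np
from skimage.metrics import structural_similarity as ssim

def draw_differences(img1_bgr, img2_bgr, min_area=100):
    # Assumes both page renders already share the same shape.
    gray1 = cv2.cvtColor(img1_bgr, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(img2_bgr, cv2.COLOR_BGR2GRAY)
    score, diff = ssim(gray1, gray2, full=True)
    diff = (diff * 255).astype(np.uint8)
    # Low SSIM values mark changed pixels; threshold and box the regions.
    thresh = cv2.threshold(diff, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    annotated = img1_bgr.copy()
    for c in contours:
        if cv2.contourArea(c) >= min_area:
            x, y, w, h = cv2.boundingRect(c)
            cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 0, 255), 3)
    return score, annotated
```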

### 3. Spelling Verification
- Extracts text using OCR
- Splits text into individual words
- Checks each word against English and French dictionaries
- Provides spelling suggestions for incorrect words (a sketch of this step follows below)
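A minimal sketch of the bilingual check with `pyspellchecker`; the whitelist here is a small stand-in for the `DOMAIN_WHITELIST` defined in `pdf_comparator.py`:

```python
from spellchecker import SpellChecker

spell_en = SpellChecker(language="en")
spell_fr = SpellChecker(language="fr")
whitelist = {"mg", "ml", "thc", "cbd"}  # stand-in for DOMAIN_WHITELIST

def spelling_issue(word):
    # Return None when the word is whitelisted or known to either dictionary.
    w = word.lower()
    if w in whitelist:
        return None
    if not spell_en.unknown([w]) or not spell_fr.unknown([w]):
        return None
    return {
        "word": word,
        "english": sorted(spell_en.candidates(w) or [])[:3],
        "french": sorted(spell_fr.candidates(w) or [])[:3],
    }

print(spelling_issue("gouttes"))   # known French word -> None
print(spelling_issue("recieve"))   # flagged, with suggestions from both dictionaries
```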

### 4. Barcode/QR Code Detection
- Uses `pyzbar` library to detect barcodes and QR codes
- Extracts data and position information
- Displays results in organized table format (a sketch of this step follows below)
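A minimal sketch of page-by-page detection with `pyzbar`; the real implementation also classifies stack type, size category and GS1 validity, which is omitted here:

```python
from pdf2image import convert_from_path
from pyzbar.pyzbar import decode

def scan_codes(pdf_path, dpi=300):
    found = []
    for page_number, page_image in enumerate(convert_from_path(pdf_path, dpi=dpi), start=1):
        for symbol in decode(page_image):
            found.append({
                "page": page_number,
                "type": symbol.type,                      # e.g. 'QRCODE', 'CODE128'
                "data": symbol.data.decode("utf-8", "replace"),
                "rect": symbol.rect._asdict(),            # left, top, width, height
            })
    return found
```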

## Configuration

### Environment Variables
- `FLASK_ENV`: Set to `development` for debug mode
- `MAX_CONTENT_LENGTH`: Maximum file upload size (default: 16MB)

### Customization
- Modify `pdf_comparator.py` to change comparison algorithms
- Update `static/css/style.css` for custom styling
- Edit `templates/index.html` for interface changes

## Troubleshooting

### Common Issues

1. **Tesseract not found**
   - Ensure Tesseract is installed and in your system PATH
   - On macOS, try: `brew install tesseract`

2. **PDF processing errors**
   - Check that PDFs are not corrupted
   - Ensure PDFs contain readable text (not just images)

3. **Memory issues with large PDFs**
   - Reduce DPI in `pdf_comparator.py` (default: 200)
   - Process PDFs page by page for very large documents

4. **Spelling checker not working**
   - Ensure internet connection for first run (downloads dictionary data)
   - Check that `pyspellchecker` is properly installed

### Performance Tips

- Use smaller DPI values for faster processing
- Limit PDF page count for large documents
- Ensure sufficient RAM for image processing

## Security Considerations

- Uploaded files are stored temporarily and cleaned up
- File size limits prevent DoS attacks
- Input validation prevents malicious file uploads
- Session-based file handling ensures isolation

## Contributing

1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Add tests if applicable
5. Submit a pull request

## License

This project is open source and available under the MIT License.

## Support

For issues and questions:
1. Check the troubleshooting section
2. Review the code comments
3. Create an issue on the repository

## Future Enhancements

- Support for more document formats
- Advanced text comparison algorithms
- Machine learning-based difference detection
- Batch processing capabilities
- Export functionality for comparison reports
app.py
ADDED
|
@@ -0,0 +1,97 @@
import os
import uuid
import json
from flask import Flask, request, render_template, jsonify, send_file
from werkzeug.utils import secure_filename
from pdf_comparator import PDFComparator
import tempfile
import shutil

app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['RESULTS_FOLDER'] = 'results'

# Ensure directories exist
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['RESULTS_FOLDER'], exist_ok=True)

ALLOWED_EXTENSIONS = {'pdf'}

def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/upload', methods=['POST'])
def upload_files():
    if 'pdf1' not in request.files or 'pdf2' not in request.files:
        return jsonify({'error': 'Both PDF files are required'}), 400

    pdf1 = request.files['pdf1']
    pdf2 = request.files['pdf2']

    if pdf1.filename == '' or pdf2.filename == '':
        return jsonify({'error': 'Both PDF files are required'}), 400

    if not (allowed_file(pdf1.filename) and allowed_file(pdf2.filename)):
        return jsonify({'error': 'Only PDF files are allowed'}), 400

    # Create unique session directory
    session_id = str(uuid.uuid4())
    session_dir = os.path.join(app.config['UPLOAD_FOLDER'], session_id)
    os.makedirs(session_dir, exist_ok=True)

    # Save uploaded files
    pdf1_path = os.path.join(session_dir, secure_filename(pdf1.filename))
    pdf2_path = os.path.join(session_dir, secure_filename(pdf2.filename))

    pdf1.save(pdf1_path)
    pdf2.save(pdf2_path)

    try:
        # Initialize PDF comparator
        comparator = PDFComparator()

        # Perform comparison
        results = comparator.compare_pdfs(pdf1_path, pdf2_path, session_id)

        # Save results
        results_path = os.path.join(app.config['RESULTS_FOLDER'], f'{session_id}_results.json')
        with open(results_path, 'w') as f:
            json.dump(results, f, indent=2)

        return jsonify({
            'success': True,
            'session_id': session_id,
            'results': results
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500

@app.route('/results/<session_id>')
def get_results(session_id):
    results_path = os.path.join(app.config['RESULTS_FOLDER'], f'{session_id}_results.json')

    if not os.path.exists(results_path):
        return jsonify({'error': 'Results not found'}), 404

    with open(results_path, 'r') as f:
        results = json.load(f)

    return jsonify(results)

@app.route('/download/<session_id>/<filename>')
def download_file(session_id, filename):
    file_path = os.path.join(app.config['UPLOAD_FOLDER'], session_id, filename)

    if not os.path.exists(file_path):
        return jsonify({'error': 'File not found'}), 404

    return send_file(file_path, as_attachment=True)

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)
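For quick testing outside the browser, the `/upload` route above can be exercised from a script. This sketch assumes the third-party `requests` package (not in `requirements.txt`) and hypothetical file names:

```python
# Illustrative client for the /upload endpoint defined above.
import requests

with open("proof_v1.pdf", "rb") as f1, open("proof_v2.pdf", "rb") as f2:  # hypothetical files
    resp = requests.post(
        "http://localhost:5000/upload",
        files={"pdf1": f1, "pdf2": f2},
    )
resp.raise_for_status()
payload = resp.json()
print(payload["session_id"], len(payload["results"].get("spelling_issues", [])))
```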
pdf_comparator.py
ADDED
|
@@ -0,0 +1,551 @@
+import os
+import cv2
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+import pytesseract
+from pdf2image import convert_from_path
+from pyzbar.pyzbar import decode
+from spellchecker import SpellChecker
+import nltk
+from skimage.metrics import structural_similarity as ssim
+from skimage import color
+import json
+import tempfile
+import shutil
+import unicodedata
+import regex as re
+
+# Domain whitelist for spell checking
+DOMAIN_WHITELIST = {
+    # units / abbreviations
+    "mg", "mg/g", "ml", "g", "thc", "cbd", "tcm", "mct",
+    # common packaging terms / bilingual words you expect
+    "gouttes", "tennir", "net", "zoom", "tytann", "dome", "drops",
+    # brand or proper names you want to ignore completely
+    "purified", "brands", "tytann", "dome", "drops",
+}
+# lowercase everything in whitelist for comparisons
+DOMAIN_WHITELIST = {w.lower() for w in DOMAIN_WHITELIST}
+
+# Safe import for regex with fallback
+try:
+    import regex as _re
+    _USE_REGEX = True
+except ImportError:
+    import re as _re
+    _USE_REGEX = False
+
+TOKEN_PATTERN = r"(?:\p{L})(?:[\p{L}'-]{1,})" if _USE_REGEX else r"[A-Za-z][A-Za-z'-]{1,}"
+
+class PDFComparator:
+    def __init__(self):
+        # Initialize spell checkers for English and French
+        self.english_spellchecker = SpellChecker(language='en')
+        self.french_spellchecker = SpellChecker(language='fr')
+
+        # Add domain whitelist to spell checkers
+        for w in DOMAIN_WHITELIST:
+            self.english_spellchecker.word_frequency.add(w)
+            self.french_spellchecker.word_frequency.add(w)
+
+        # Download required NLTK data
+        try:
+            nltk.data.find('tokenizers/punkt')
+        except LookupError:
+            nltk.download('punkt')
+
+    def enhance_image_for_tiny_fonts(self, image):
+        """Enhance image specifically for tiny font OCR"""
+        try:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+            enhanced = clahe.apply(gray)
+            denoised = cv2.bilateralFilter(enhanced, 9, 75, 75)
+            gaussian = cv2.GaussianBlur(denoised, (0, 0), 2.0)
+            unsharp_mask = cv2.addWeighted(denoised, 1.5, gaussian, -0.5, 0)
+            thresh = cv2.adaptiveThreshold(unsharp_mask, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
+            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
+            cleaned = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
+            return cleaned
+        except Exception as e:
+            print(f"Error enhancing image for tiny fonts: {str(e)}")
+            return image
+
+    def create_inverted_image(self, image):
+        """Create inverted image for white text on dark backgrounds"""
+        try:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+            inverted = cv2.bitwise_not(gray)
+            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+            enhanced = clahe.apply(inverted)
+            _, thresh = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+            return thresh
+        except Exception as e:
+            print(f"Error creating inverted image: {str(e)}")
+            return image
+
+    def extract_color_channels(self, image):
+        """Extract text from different color channels"""
+        try:
+            # RGB channels
+            b, g, r = cv2.split(image)
+
+            # HSV channels
+            hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
+            h, s, v = cv2.split(hsv)
+
+            # LAB channels
+            lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
+            l, a, b_lab = cv2.split(lab)
+
+            channels = [r, g, b, v, l]
+            texts = []
+
+            for channel in channels:
+                _, thresh = cv2.threshold(channel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+                text = pytesseract.image_to_string(thresh, config='--oem 3 --psm 6')
+                if text.strip():
+                    texts.append(text)
+
+            return texts
+        except Exception as e:
+            print(f"Error extracting color channels: {str(e)}")
+            return []
+
+    def create_edge_enhanced_image(self, image):
+        """Create edge-enhanced image for text detection"""
+        try:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+            edges = cv2.Canny(gray, 50, 150)
+            kernel = np.ones((2, 2), np.uint8)
+            dilated = cv2.dilate(edges, kernel, iterations=1)
+            inverted = cv2.bitwise_not(dilated)
+            return inverted
+        except Exception as e:
+            print(f"Error creating edge-enhanced image: {str(e)}")
+            return image
+
+    def ocr_with_multiple_configs(self, image):
+        """Run OCR with multiple configurations and return best result"""
+        configs = [
+            '--oem 3 --psm 6',   # Uniform block of text
+            '--oem 3 --psm 8',   # Single word
+            '--oem 3 --psm 13',  # Raw line
+            '--oem 1 --psm 6',   # LSTM + Uniform block
+            '--oem 3 --psm 3',   # Fully automatic page segmentation
+        ]
+
+        best_text = ""
+        best_length = 0
+
+        for config in configs:
+            try:
+                text = pytesseract.image_to_string(image, config=config)
+                if len(text.strip()) > best_length:
+                    best_text = text
+                    best_length = len(text.strip())
+            except Exception as e:
+                print(f"OCR config {config} failed: {str(e)}")
+                continue
+
+        return best_text
+
+    def extract_multi_color_text(self, image):
+        """Extract text using multiple preprocessing methods"""
+        texts = []
+
+        # Method 1: Standard black text
+        enhanced = self.enhance_image_for_tiny_fonts(image)
+        text1 = self.ocr_with_multiple_configs(enhanced)
+        if text1.strip():
+            texts.append(text1)
+
+        # Method 2: Inverted text (white on dark)
+        inverted = self.create_inverted_image(image)
+        text2 = self.ocr_with_multiple_configs(inverted)
+        if text2.strip():
+            texts.append(text2)
+
+        # Method 3: Color channel separation
+        color_texts = self.extract_color_channels(image)
+        texts.extend(color_texts)
+
+        # Method 4: Edge-enhanced
+        edge_enhanced = self.create_edge_enhanced_image(image)
+        text4 = self.ocr_with_multiple_configs(edge_enhanced)
+        if text4.strip():
+            texts.append(text4)
+
+        # Combine all texts and return the best one
+        combined_text = " ".join(texts)
+        return combined_text
+
+    def validate_pdf(self, pdf_path):
+        """Validate that PDF contains '50 Carroll' using enhanced OCR"""
+        try:
+            # Multiple DPI settings for better detection
+            dpi_settings = [200, 300, 400]
+
+            for dpi in dpi_settings:
+                try:
+                    images = convert_from_path(pdf_path, dpi=dpi)
+
+                    for page_num, image in enumerate(images):
+                        # Convert PIL image to OpenCV format
+                        opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+                        # Enhanced text extraction
+                        text = self.extract_multi_color_text(opencv_image)
+
+                        # Check for "50 Carroll" with multiple patterns
+                        patterns = ["50 Carroll", "50 carroll", "50Carroll", "50 carroll"]
+                        for pattern in patterns:
+                            if pattern in text:
+                                return True
+
+                        # Also try standard OCR as fallback
+                        standard_text = pytesseract.image_to_string(opencv_image, config='--oem 3 --psm 6')
+                        for pattern in patterns:
+                            if pattern in standard_text:
+                                return True
+
+                except Exception as e:
+                    print(f"DPI {dpi} failed: {str(e)}")
+                    continue
+
+            return False
+
+        except Exception as e:
+            raise Exception(f"Error validating PDF: {str(e)}")
+
+    def extract_text_from_pdf(self, pdf_path):
+        """Extract text from PDF using enhanced OCR"""
+        try:
+            # Use higher DPI for better text extraction
+            images = convert_from_path(pdf_path, dpi=300)
+            all_text = []
+
+            for page_num, image in enumerate(images):
+                opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+                # Enhanced text extraction
+                text = self.extract_multi_color_text(opencv_image)
+
+                # Fallback to standard OCR if enhanced extraction is empty
+                if not text.strip():
+                    text = pytesseract.image_to_string(opencv_image, config='--oem 3 --psm 6')
+
+                all_text.append({
+                    'page': page_num + 1,
+                    'text': text,
+                    'image': image
+                })
+
+            return all_text
+
+        except Exception as e:
+            raise Exception(f"Error extracting text from PDF: {str(e)}")
+
+    def _likely_french(self, token: str) -> bool:
+        """Helper function to guess if a token is likely French"""
+        if _USE_REGEX:
+            # any Latin letter outside ASCII => probably FR (é, è, ç…)
+            return bool(_re.search(r"[\p{Letter}&&\p{Latin}&&[^A-Za-z]]", token))
+        # fallback: any non-ascii letter
+        return any((not ('a' <= c.lower() <= 'z')) and c.isalpha() for c in token)
+
+    def check_spelling(self, text):
+        """
+        Robust EN/FR spell check:
+        - Unicode-aware tokens (keeps accents)
+        - Normalizes curly quotes/ligatures
+        - Heuristic per-token language (accented => FR; else EN)
+        - Flags if unknown in its likely language (not both)
+        """
+        try:
+            text = unicodedata.normalize("NFKC", text)
+            # straighten curly quotes before tokenizing
+            text = text.replace("\u2019", "'").replace("\u201c", '"').replace("\u201d", '"')
+
+            tokens = _re.findall(TOKEN_PATTERN, text, flags=_re.UNICODE if _USE_REGEX else 0)
+
+            issues = []
+            for raw in tokens:
+                t = raw.lower()
+
+                # skip very short, short ALL-CAPS acronyms, and whitelisted terms
+                if len(t) < 3:
+                    continue
+                if raw.isupper() and len(raw) <= 3:  # Changed from <=5 to <=3
+                    continue
+                if t in DOMAIN_WHITELIST:
+                    continue
+
+                miss_en = t in self.english_spellchecker.unknown([t])
+                miss_fr = t in self.french_spellchecker.unknown([t])
+
+                use_fr = self._likely_french(raw)
+
+                # Prefer the likely language, but fall back to "either language unknown"
+                if (use_fr and miss_fr) or ((not use_fr) and miss_en) or (miss_en and miss_fr):
+                    issues.append({
+                        "word": raw,
+                        "lang": "fr" if use_fr else "en",
+                        "suggestions_en": list(self.english_spellchecker.candidates(t))[:3],
+                        "suggestions_fr": list(self.french_spellchecker.candidates(t))[:3],
+                    })
+
+            return issues
+        except Exception as e:
+            print(f"Error checking spelling: {e}")
+            return []
+
+    def annotate_spelling_errors_on_image(self, pil_image, misspelled):
+        """
+        Draw one red rectangle around each misspelled token using Tesseract word boxes.
+        'misspelled' must be a list of dicts with 'word' keys (from check_spelling).
+        """
+        if not misspelled:
+            return pil_image
+
+        def _norm(s: str) -> str:
+            return unicodedata.normalize("NFKC", s).replace("\u2019", "'").strip(".,:;!?)(").lower()
+
+        miss_set = {_norm(m["word"]) for m in misspelled}
+
+        img = pil_image
+        try:
+            data = pytesseract.image_to_data(
+                img,
+                lang="eng+fra",  # Added lang parameter
+                config="--oem 3 --psm 6",
+                output_type=pytesseract.Output.DICT,
+            )
+        except Exception as e:
+            print("image_to_data failed:", e)
+            return img
+
+        draw = ImageDraw.Draw(img)
+        n = len(data.get("text", []))
+        for i in range(n):
+            word = (data["text"][i] or "").strip()
+            if not word:
+                continue
+            clean = _norm(word)  # Used _norm function
+
+            if clean and clean in miss_set:
+                x, y, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
+                draw.rectangle([x, y, x + w, y + h], outline="red", width=4)
+
+        return img
+
+    def detect_barcodes_qr_codes(self, image):
+        """Detect and decode barcodes and QR codes"""
+        try:
+            # Convert PIL image to OpenCV format
+            opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+            # Decode barcodes and QR codes
+            decoded_objects = decode(opencv_image)
+
+            barcodes = []
+            for obj in decoded_objects:
+                barcode_info = {
+                    'type': obj.type,
+                    'data': obj.data.decode('utf-8'),
+                    'rect': obj.rect
+                }
+                barcodes.append(barcode_info)
+
+            return barcodes
+
+        except Exception as e:
+            print(f"Error detecting barcodes: {str(e)}")
+            return []
+
+    def compare_colors(self, image1, image2):
+        """Compare colors between two images and return differences"""
+        try:
+            # Convert images to same size
+            img1 = np.array(image1)
+            img2 = np.array(image2)
+
+            # Resize images to same dimensions
+            height = min(img1.shape[0], img2.shape[0])
+            width = min(img1.shape[1], img2.shape[1])
+
+            img1_resized = cv2.resize(img1, (width, height))
+            img2_resized = cv2.resize(img2, (width, height))
+
+            # Convert to grayscale for comparison
+            gray1 = cv2.cvtColor(img1_resized, cv2.COLOR_RGB2GRAY)
+            gray2 = cv2.cvtColor(img2_resized, cv2.COLOR_RGB2GRAY)
+
+            # Calculate structural similarity
+            (score, diff) = ssim(gray1, gray2, full=True)
+
+            # Convert difference to binary mask
+            diff = (diff * 255).astype("uint8")
+            thresh = cv2.threshold(diff, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
+
+            # Find contours of differences
+            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+            color_differences = []
+            for contour in contours:
+                if cv2.contourArea(contour) > 100:  # Filter small differences
+                    x, y, w, h = cv2.boundingRect(contour)
+                    color_differences.append({
+                        'x': x,
+                        'y': y,
+                        'width': w,
+                        'height': h,
+                        'area': cv2.contourArea(contour)
+                    })
+
+            return color_differences
+
+        except Exception as e:
+            print(f"Error comparing colors: {str(e)}")
+            return []
+
+    def create_annotated_image(self, image, differences, output_path):
+        """Create annotated image with red boxes around differences"""
+        try:
+            # Create a copy of the image
+            annotated_image = image.copy()
+            draw = ImageDraw.Draw(annotated_image)
+
+            # Draw red rectangles around differences
+            for diff in differences:
+                x, y, w, h = diff['x'], diff['y'], diff['width'], diff['height']
+                draw.rectangle([x, y, x + w, y + h], outline='red', width=3)
+
+            # Save annotated image
+            annotated_image.save(output_path)
+
+        except Exception as e:
+            print(f"Error creating annotated image: {str(e)}")
+
+    def compare_pdfs(self, pdf1_path, pdf2_path, session_id):
+        """Main comparison function"""
+        try:
+            # Validate both PDFs contain "50 Carroll"
+            if not self.validate_pdf(pdf1_path):
+                raise Exception("INVALID DOCUMENT")
+
+            if not self.validate_pdf(pdf2_path):
+                raise Exception("INVALID DOCUMENT")
+
+            # Extract text and images from both PDFs
+            pdf1_data = self.extract_text_from_pdf(pdf1_path)
+            pdf2_data = self.extract_text_from_pdf(pdf2_path)
+
+            # Initialize results
+            results = {
+                'session_id': session_id,
+                'validation': {
+                    'pdf1_valid': True,
+                    'pdf2_valid': True,
+                    'validation_text': '50 Carroll'
+                },
+                'text_comparison': [],
+                'spelling_issues': [],
+                'barcodes_qr_codes': [],
+                'color_differences': [],
+                'annotated_images': []
+            }
+
+            # Compare text and check spelling
+            for i, (page1, page2) in enumerate(zip(pdf1_data, pdf2_data)):
+                page_results = {
+                    'page': i + 1,
+                    'text_differences': [],
+                    'spelling_issues_pdf1': [],
+                    'spelling_issues_pdf2': [],
+                    'barcodes_pdf1': [],
+                    'barcodes_pdf2': [],
+                    'color_differences': []
+                }
+
+                # Check spelling for both PDFs
+                page_results['spelling_issues_pdf1'] = self.check_spelling(page1['text'])
+                page_results['spelling_issues_pdf2'] = self.check_spelling(page2['text'])
+
+                # Create spelling-only annotated images (one box per error)
+                spell_dir = f'static/results/{session_id}'
+                os.makedirs(spell_dir, exist_ok=True)
+                spell_img1 = page1['image'].copy()
+                spell_img2 = page2['image'].copy()
+                spell_img1 = self.annotate_spelling_errors_on_image(spell_img1, page_results['spelling_issues_pdf1'])
+                spell_img2 = self.annotate_spelling_errors_on_image(spell_img2, page_results['spelling_issues_pdf2'])
+                spell_path1 = f'{spell_dir}/page_{i+1}_pdf1_spelling.png'
+                spell_path2 = f'{spell_dir}/page_{i+1}_pdf2_spelling.png'
+                spell_img1.save(spell_path1)
+                spell_img2.save(spell_path2)
+
+                # Detect barcodes and QR codes
+                page_results['barcodes_pdf1'] = self.detect_barcodes_qr_codes(page1['image'])
+                page_results['barcodes_pdf2'] = self.detect_barcodes_qr_codes(page2['image'])
+
+                # Compare colors
+                color_diffs = self.compare_colors(page1['image'], page2['image'])
+                page_results['color_differences'] = color_diffs
+
+                # Create annotated images
+                if color_diffs:
+                    output_dir = f'static/results/{session_id}'
+                    os.makedirs(output_dir, exist_ok=True)
+
+                    annotated_path1 = f'{output_dir}/page_{i+1}_pdf1_annotated.png'
+                    annotated_path2 = f'{output_dir}/page_{i+1}_pdf2_annotated.png'
+
+                    self.create_annotated_image(page1['image'], color_diffs, annotated_path1)
+                    self.create_annotated_image(page2['image'], color_diffs, annotated_path2)
+
+                    page_results['annotated_images'] = {
+                        'pdf1': f'results/{session_id}/page_{i+1}_pdf1_annotated.png',
+                        'pdf2': f'results/{session_id}/page_{i+1}_pdf2_annotated.png',
+                        'pdf1_spelling': f'results/{session_id}/page_{i+1}_pdf1_spelling.png',
+                        'pdf2_spelling': f'results/{session_id}/page_{i+1}_pdf2_spelling.png'
+                    }
+                else:
+                    # If no color differences, still save spelling images
+                    page_results['annotated_images'] = {
+                        'pdf1_spelling': f'results/{session_id}/page_{i+1}_pdf1_spelling.png',
+                        'pdf2_spelling': f'results/{session_id}/page_{i+1}_pdf2_spelling.png'
+                    }
+
+                # Add spelling issues summary to text differences
+                if page_results['spelling_issues_pdf1'] or page_results['spelling_issues_pdf2']:
+                    page_results['text_differences'].append({
+                        'type': 'spelling',
+                        'pdf1_issues': len(page_results['spelling_issues_pdf1']),
+                        'pdf2_issues': len(page_results['spelling_issues_pdf2']),
+                        'details': {
+                            'pdf1': [issue['word'] for issue in page_results['spelling_issues_pdf1']],
+                            'pdf2': [issue['word'] for issue in page_results['spelling_issues_pdf2']]
+                        }
+                    })
+
+                results['text_comparison'].append(page_results)
+
+            # Aggregate spelling issues
+            all_spelling_issues = []
+            for page in results['text_comparison']:
+                all_spelling_issues.extend(page['spelling_issues_pdf1'])
+                all_spelling_issues.extend(page['spelling_issues_pdf2'])
+
+            results['spelling_issues'] = all_spelling_issues
+
+            # Aggregate barcodes and QR codes
+            all_barcodes = []
+            for page in results['text_comparison']:
+                all_barcodes.extend(page['barcodes_pdf1'])
+                all_barcodes.extend(page['barcodes_pdf2'])
+
+            results['barcodes_qr_codes'] = all_barcodes
+
+            return results
+
+        except Exception as e:
+            raise Exception(f"Error comparing PDFs: {str(e)}")
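For reference, a minimal sketch of how PDFComparator could be driven outside the Flask app; the PDF file names are hypothetical, and the session id only determines where annotated images are written under static/results/.

    # sketch: run a comparison directly and inspect the JSON report
    import json
    from pdf_comparator import PDFComparator

    comparator = PDFComparator()
    report = comparator.compare_pdfs('label_v1.pdf', 'label_v2.pdf', 'demo-session')

    print(json.dumps(report['validation'], indent=2))
    print(f"{len(report['spelling_issues'])} spelling issue(s), "
          f"{len(report['barcodes_qr_codes'])} barcode/QR code(s)")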
requirements.txt
ADDED
@@ -0,0 +1,16 @@
+Flask==2.3.3
+Werkzeug==2.3.7
+PyPDF2==3.0.1
+pdf2image==1.16.3
+Pillow==10.0.1
+opencv-python==4.8.1.78
+pytesseract==0.3.10
+pyzbar==0.1.9
+pyspellchecker==0.7.2
+nltk==3.8.1
+numpy==1.24.3
+scikit-image==0.21.0
+matplotlib==3.7.2
+pandas==2.0.3
+reportlab==4.0.4
+regex==2023.10.3
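Several of these packages also rely on native tools that pip does not install: pdf2image needs Poppler (pdftoppm), pytesseract needs the Tesseract binary, and pyzbar needs the zbar shared library. A quick check along these lines (binary names can vary by platform) can surface a missing tool before the app starts:

    # sketch: warn if the native tools behind pdf2image / pytesseract are missing
    import shutil

    for tool in ('pdftoppm', 'tesseract'):
        if shutil.which(tool) is None:
            print(f'warning: {tool} not found on PATH')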
run.py
ADDED
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""
+Startup script for PDF Comparison Tool
+"""
+
+import os
+import sys
+import subprocess
+import webbrowser
+import time
+from pathlib import Path
+
+def check_python_version():
+    """Check if Python version is compatible"""
+    if sys.version_info < (3, 7):
+        print("❌ Python 3.7 or higher is required")
+        print(f"Current version: {sys.version}")
+        return False
+    print(f"✅ Python {sys.version.split()[0]} is compatible")
+    return True
+
+def check_dependencies():
+    """Check if required dependencies are installed"""
+    try:
+        import flask
+        import cv2
+        import numpy
+        import PIL
+        import pytesseract
+        import pdf2image
+        import pyzbar
+        import spellchecker
+        import nltk
+        import skimage
+        print("✅ All Python dependencies are installed")
+        return True
+    except ImportError as e:
+        print(f"❌ Missing dependency: {e}")
+        print("Please run: pip install -r requirements.txt")
+        return False
+
+def check_tesseract():
+    """Check if Tesseract OCR is installed"""
+    try:
+        import pytesseract
+        pytesseract.get_tesseract_version()
+        print("✅ Tesseract OCR is available")
+        return True
+    except Exception as e:
+        print(f"❌ Tesseract OCR not found: {e}")
+        print("Please install Tesseract:")
+        print("  macOS: brew install tesseract")
+        print("  Ubuntu: sudo apt-get install tesseract-ocr")
+        print("  Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki")
+        return False
+
+def create_directories():
+    """Create necessary directories"""
+    directories = ['uploads', 'results', 'static/results']
+    for directory in directories:
+        Path(directory).mkdir(parents=True, exist_ok=True)
+    print("✅ Directories created")
+
+def start_application():
+    """Start the Flask application"""
+    print("\n🚀 Starting PDF Comparison Tool...")
+    print("📱 The application will be available at: http://localhost:5000")
+    print("ℹ️ Press Ctrl+C to stop the application")
+    print("-" * 50)
+
+    try:
+        # Start the Flask app
+        from app import app
+        app.run(debug=True, host='0.0.0.0', port=5000)
+    except KeyboardInterrupt:
+        print("\n👋 Application stopped by user")
+    except Exception as e:
+        print(f"❌ Error starting application: {e}")
+        return False
+
+    return True
+
+def main():
+    """Main startup function"""
+    print("=" * 50)
+    print("📄 PDF Comparison Tool")
+    print("=" * 50)
+
+    # Check requirements
+    if not check_python_version():
+        sys.exit(1)
+
+    if not check_dependencies():
+        sys.exit(1)
+
+    if not check_tesseract():
+        sys.exit(1)
+
+    # Create directories
+    create_directories()
+
+    # Ask user if they want to open browser
+    try:
+        response = input("\n🌐 Open browser automatically? (y/n): ").lower().strip()
+        if response in ['y', 'yes']:
+            # Wait a moment for the server to start
+            def open_browser():
+                time.sleep(2)
+                webbrowser.open('http://localhost:5000')
+
+            import threading
+            browser_thread = threading.Thread(target=open_browser)
+            browser_thread.daemon = True
+            browser_thread.start()
+    except KeyboardInterrupt:
+        print("\n👋 Setup cancelled by user")
+        sys.exit(0)
+
+    # Start the application
+    start_application()
+
+if __name__ == "__main__":
+    main()
static/css/style.css
ADDED
@@ -0,0 +1,228 @@
+/* Custom styles for PDF Comparison Tool */
+
+body {
+    background-color: #f8f9fa;
+    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+}
+
+.navbar-brand {
+    font-weight: 600;
+    font-size: 1.5rem;
+}
+
+.card {
+    border: none;
+    border-radius: 12px;
+    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    transition: transform 0.2s ease-in-out;
+}
+
+.card:hover {
+    transform: translateY(-2px);
+}
+
+.card-header {
+    border-radius: 12px 12px 0 0 !important;
+    border-bottom: none;
+    font-weight: 600;
+}
+
+.btn-primary {
+    background: linear-gradient(135deg, #007bff, #0056b3);
+    border: none;
+    border-radius: 8px;
+    font-weight: 600;
+    padding: 12px 24px;
+    transition: all 0.3s ease;
+}
+
+.btn-primary:hover {
+    background: linear-gradient(135deg, #0056b3, #004085);
+    transform: translateY(-1px);
+    box-shadow: 0 4px 8px rgba(0, 123, 255, 0.3);
+}
+
+.form-control {
+    border-radius: 8px;
+    border: 2px solid #e9ecef;
+    padding: 12px 16px;
+    transition: border-color 0.3s ease;
+}
+
+.form-control:focus {
+    border-color: #007bff;
+    box-shadow: 0 0 0 0.2rem rgba(0, 123, 255, 0.25);
+}
+
+.nav-tabs .nav-link {
+    border: none;
+    border-radius: 8px 8px 0 0;
+    color: #6c757d;
+    font-weight: 500;
+    padding: 12px 20px;
+    transition: all 0.3s ease;
+}
+
+.nav-tabs .nav-link:hover {
+    color: #007bff;
+    background-color: #f8f9fa;
+}
+
+.nav-tabs .nav-link.active {
+    background-color: #007bff;
+    color: white;
+    border: none;
+}
+
+.alert {
+    border-radius: 8px;
+    border: none;
+    font-weight: 500;
+}
+
+.spinner-border {
+    width: 3rem;
+    height: 3rem;
+}
+
+.progress {
+    height: 8px;
+    border-radius: 4px;
+}
+
+.progress-bar {
+    border-radius: 4px;
+}
+
+/* Comparison results styling */
+.comparison-image {
+    max-width: 100%;
+    height: auto;
+    border-radius: 8px;
+    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+    margin: 10px 0;
+}
+
+.difference-box {
+    border: 3px solid #dc3545;
+    border-radius: 4px;
+    position: relative;
+}
+
+.difference-box::after {
+    content: "Difference";
+    position: absolute;
+    top: -10px;
+    left: 10px;
+    background: #dc3545;
+    color: white;
+    padding: 2px 8px;
+    border-radius: 4px;
+    font-size: 12px;
+    font-weight: bold;
+}
+
+/* Table styling */
+.table {
+    border-radius: 8px;
+    overflow: hidden;
+    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+}
+
+.table thead th {
+    background-color: #f8f9fa;
+    border-bottom: 2px solid #dee2e6;
+    font-weight: 600;
+    color: #495057;
+}
+
+.table tbody tr:hover {
+    background-color: #f8f9fa;
+}
+
+/* Badge styling */
+.badge {
+    font-size: 0.8em;
+    padding: 6px 10px;
+    border-radius: 6px;
+}
+
+.badge-danger {
+    background-color: #dc3545;
+}
+
+.badge-warning {
+    background-color: #ffc107;
+    color: #212529;
+}
+
+.badge-success {
+    background-color: #28a745;
+}
+
+.badge-info {
+    background-color: #17a2b8;
+}
+
+/* Responsive design */
+@media (max-width: 768px) {
+    .container {
+        padding: 0 15px;
+    }
+
+    .card {
+        margin-bottom: 20px;
+    }
+
+    .nav-tabs .nav-link {
+        padding: 8px 12px;
+        font-size: 14px;
+    }
+
+    .btn-lg {
+        padding: 10px 20px;
+        font-size: 16px;
+    }
+}
+
+/* Loading animation */
+@keyframes pulse {
+    0% { opacity: 1; }
+    50% { opacity: 0.5; }
+    100% { opacity: 1; }
+}
+
+.loading-pulse {
+    animation: pulse 1.5s infinite;
+}
+
+/* Custom scrollbar */
+::-webkit-scrollbar {
+    width: 8px;
+}
+
+::-webkit-scrollbar-track {
+    background: #f1f1f1;
+    border-radius: 4px;
+}
+
+::-webkit-scrollbar-thumb {
+    background: #c1c1c1;
+    border-radius: 4px;
+}
+
+::-webkit-scrollbar-thumb:hover {
+    background: #a8a8a8;
+}
+
+/* Print styles */
+@media print {
+    .navbar, .btn, .nav-tabs {
+        display: none !important;
+    }
+
+    .card {
+        box-shadow: none !important;
+        border: 1px solid #dee2e6 !important;
+    }
+}
static/js/script.js
ADDED
@@ -0,0 +1,242 @@
+// PDF Comparison Tool JavaScript
+
+document.addEventListener('DOMContentLoaded', function() {
+    const uploadForm = document.getElementById('uploadForm');
+    const loadingSection = document.getElementById('loadingSection');
+    const resultsSection = document.getElementById('resultsSection');
+    const errorSection = document.getElementById('errorSection');
+    const errorMessage = document.getElementById('errorMessage');
+
+    // Handle form submission
+    uploadForm.addEventListener('submit', function(e) {
+        e.preventDefault();
+
+        const formData = new FormData(uploadForm);
+        const pdf1 = document.getElementById('pdf1').files[0];
+        const pdf2 = document.getElementById('pdf2').files[0];
+
+        // Validate files
+        if (!pdf1 || !pdf2) {
+            showError('Please select both PDF files.');
+            return;
+        }
+
+        if (!pdf1.name.toLowerCase().endsWith('.pdf') || !pdf2.name.toLowerCase().endsWith('.pdf')) {
+            showError('Please select valid PDF files.');
+            return;
+        }
+
+        // Show loading
+        showLoading();
+        hideError();
+
+        // Submit form via AJAX
+        fetch('/upload', {
+            method: 'POST',
+            body: formData
+        })
+        .then(response => response.json())
+        .then(data => {
+            hideLoading();
+
+            if (data.success) {
+                displayResults(data.results);
+            } else {
+                showError(data.error || 'An error occurred during comparison.');
+            }
+        })
+        .catch(error => {
+            hideLoading();
+            showError('Network error: ' + error.message);
+        });
+    });
+
+    function showLoading() {
+        loadingSection.style.display = 'block';
+        resultsSection.style.display = 'none';
+        errorSection.style.display = 'none';
+    }
+
+    function hideLoading() {
+        loadingSection.style.display = 'none';
+    }
+
+    function showError(message) {
+        errorMessage.textContent = message;
+        errorSection.style.display = 'block';
+        resultsSection.style.display = 'none';
+    }
+
+    function hideError() {
+        errorSection.style.display = 'none';
+    }
+
+    function displayResults(results) {
+        resultsSection.style.display = 'block';
+
+        // Display visual comparison
+        displayVisualComparison(results);
+
+        // Display spelling issues
+        displaySpellingIssues(results);
+
+        // Display barcodes and QR codes
+        displayBarcodes(results);
+    }
+
+    function displayVisualComparison(results) {
+        const visualContent = document.getElementById('visualComparisonContent');
+        let html = '<div class="row">';
+
+        if (results.text_comparison && results.text_comparison.length > 0) {
+            results.text_comparison.forEach((page, index) => {
+                html += `
+                    <div class="col-12 mb-4">
+                        <h6 class="text-primary mb-3">Page ${page.page}</h6>
+                        <div class="row">
+                            <div class="col-md-6">
+                                <h6>PDF 1</h6>
+                                ${page.annotated_images && page.annotated_images.pdf1 ?
+                                    `<img src="/static/${page.annotated_images.pdf1}" class="comparison-image" alt="PDF 1 Page ${page.page}">` :
+                                    '<p class="text-muted">No differences detected</p>'
+                                }
+                            </div>
+                            <div class="col-md-6">
+                                <h6>PDF 2</h6>
+                                ${page.annotated_images && page.annotated_images.pdf2 ?
+                                    `<img src="/static/${page.annotated_images.pdf2}" class="comparison-image" alt="PDF 2 Page ${page.page}">` :
+                                    '<p class="text-muted">No differences detected</p>'
+                                }
+                            </div>
+                        </div>
+                        ${page.color_differences && page.color_differences.length > 0 ?
+                            `<div class="mt-3">
+                                <span class="badge badge-danger">${page.color_differences.length} color difference(s) detected</span>
+                            </div>` :
+                            '<div class="mt-3"><span class="badge badge-success">No color differences</span></div>'
+                        }
+                    </div>
+                `;
+            });
+        } else {
+            html += '<div class="col-12"><p class="text-muted">No visual comparison data available.</p></div>';
+        }
+
+        html += '</div>';
+        visualContent.innerHTML = html;
+    }
+
+    function displaySpellingIssues(results) {
+        const spellingContent = document.getElementById('spellingIssuesContent');
+        let html = '';
+
+        if (results.spelling_issues && results.spelling_issues.length > 0) {
+            html += `
+                <div class="table-responsive">
+                    <table class="table table-striped">
+                        <thead>
+                            <tr>
+                                <th>Word</th>
+                                <th>Language</th>
+                                <th>English Suggestions</th>
+                                <th>French Suggestions</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+            `;
+
+            results.spelling_issues.forEach(issue => {
+                // field names match the dicts produced by check_spelling in pdf_comparator.py
+                const englishSuggestions = (issue.suggestions_en || []).join(', ') || 'None';
+                const frenchSuggestions = (issue.suggestions_fr || []).join(', ') || 'None';
+
+                html += `
+                    <tr>
+                        <td><strong>${issue.word}</strong></td>
+                        <td><code>${issue.lang}</code></td>
+                        <td>${englishSuggestions}</td>
+                        <td>${frenchSuggestions}</td>
+                    </tr>
+                `;
+            });
+
+            html += `
+                        </tbody>
+                    </table>
+                </div>
+                <div class="mt-3">
+                    <span class="badge badge-warning">${results.spelling_issues.length} spelling issue(s) found</span>
+                </div>
+            `;
+        } else {
+            html = '<div class="alert alert-success"><i class="fas fa-check me-2"></i>No spelling issues detected.</div>';
+        }
+
+        spellingContent.innerHTML = html;
+    }
+
+    function displayBarcodes(results) {
+        const barcodesContent = document.getElementById('barcodesContent');
+        let html = '';
+
+        if (results.barcodes_qr_codes && results.barcodes_qr_codes.length > 0) {
+            html += `
+                <div class="table-responsive">
+                    <table class="table table-striped">
+                        <thead>
+                            <tr>
+                                <th>Type</th>
+                                <th>Data</th>
+                                <th>Position</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+            `;
+
+            results.barcodes_qr_codes.forEach(barcode => {
+                const position = `(${barcode.rect.left}, ${barcode.rect.top}) - (${barcode.rect.left + barcode.rect.width}, ${barcode.rect.top + barcode.rect.height})`;
+
+                html += `
+                    <tr>
+                        <td><span class="badge badge-info">${barcode.type}</span></td>
+                        <td><code>${barcode.data}</code></td>
+                        <td>${position}</td>
+                    </tr>
+                `;
+            });
+
+            html += `
+                        </tbody>
+                    </table>
+                </div>
+                <div class="mt-3">
+                    <span class="badge badge-info">${results.barcodes_qr_codes.length} barcode/QR code(s) detected</span>
+                </div>
+            `;
+        } else {
+            html = '<div class="alert alert-info"><i class="fas fa-info-circle me-2"></i>No barcodes or QR codes detected.</div>';
+        }
+
+        barcodesContent.innerHTML = html;
+    }
+
+    // Add file input change handlers for better UX
+    document.getElementById('pdf1').addEventListener('change', function(e) {
+        const file = e.target.files[0];
+        if (file) {
+            const label = e.target.nextElementSibling;
+            if (label && label.classList.contains('form-text')) {
+                label.textContent = `Selected: ${file.name}`;
+            }
+        }
+    });
+
+    document.getElementById('pdf2').addEventListener('change', function(e) {
+        const file = e.target.files[0];
+        if (file) {
+            const label = e.target.nextElementSibling;
+            if (label && label.classList.contains('form-text')) {
+                label.textContent = `Selected: ${file.name}`;
+            }
+        }
+    });
+});
templates/index.html
ADDED
@@ -0,0 +1,142 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>PDF Comparison Tool</title>
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
+    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
+    <link href="{{ url_for('static', filename='css/style.css') }}" rel="stylesheet">
+</head>
+<body>
+    <div class="container-fluid">
+        <div class="row">
+            <!-- Header -->
+            <div class="col-12">
+                <nav class="navbar navbar-expand-lg navbar-dark bg-primary">
+                    <div class="container">
+                        <a class="navbar-brand" href="#">
+                            <i class="fas fa-file-pdf me-2"></i>
+                            PDF Comparison Tool
+                        </a>
+                    </div>
+                </nav>
+            </div>
+        </div>
+
+        <div class="row mt-4">
+            <div class="col-12">
+                <div class="container">
+                    <!-- Upload Section -->
+                    <div class="card shadow-sm">
+                        <div class="card-header bg-light">
+                            <h5 class="mb-0">
+                                <i class="fas fa-upload me-2"></i>
+                                Upload PDF Files for Comparison
+                            </h5>
+                        </div>
+                        <div class="card-body">
+                            <form id="uploadForm" enctype="multipart/form-data">
+                                <div class="row">
+                                    <div class="col-md-6">
+                                        <div class="mb-3">
+                                            <label for="pdf1" class="form-label">First PDF File</label>
+                                            <input type="file" class="form-control" id="pdf1" name="pdf1" accept=".pdf" required>
+                                            <div class="form-text">Select a PDF file for comparison</div>
+                                        </div>
+                                    </div>
+                                    <div class="col-md-6">
+                                        <div class="mb-3">
+                                            <label for="pdf2" class="form-label">Second PDF File</label>
+                                            <input type="file" class="form-control" id="pdf2" name="pdf2" accept=".pdf" required>
+                                            <div class="form-text">Select a PDF file for comparison</div>
+                                        </div>
+                                    </div>
+                                </div>
+                                <div class="d-grid">
+                                    <button type="submit" class="btn btn-primary btn-lg">
+                                        <i class="fas fa-search me-2"></i>
+                                        Compare PDFs
+                                    </button>
+                                </div>
+                            </form>
+                        </div>
+                    </div>
+
+                    <!-- Loading Section -->
+                    <div id="loadingSection" class="card shadow-sm mt-4" style="display: none;">
+                        <div class="card-body text-center">
+                            <div class="spinner-border text-primary" role="status">
+                                <span class="visually-hidden">Loading...</span>
+                            </div>
+                            <p class="mt-3">Processing PDFs... This may take a few minutes.</p>
+                            <div class="progress mt-3">
+                                <div class="progress-bar progress-bar-striped progress-bar-animated" role="progressbar" style="width: 100%"></div>
+                            </div>
+                        </div>
+                    </div>
+
+                    <!-- Results Section -->
+                    <div id="resultsSection" class="mt-4" style="display: none;">
+                        <!-- Comparison Results Tabs -->
+                        <div class="card shadow-sm">
+                            <div class="card-header">
+                                <ul class="nav nav-tabs card-header-tabs" id="resultsTabs" role="tablist">
+                                    <li class="nav-item" role="presentation">
+                                        <button class="nav-link active" id="visual-tab" data-bs-toggle="tab" data-bs-target="#visual" type="button" role="tab">
+                                            <i class="fas fa-eye me-2"></i>Visual Comparison
+                                        </button>
+                                    </li>
+                                    <li class="nav-item" role="presentation">
+                                        <button class="nav-link" id="spelling-tab" data-bs-toggle="tab" data-bs-target="#spelling" type="button" role="tab">
+                                            <i class="fas fa-spell-check me-2"></i>Spelling Issues
+                                        </button>
+                                    </li>
+                                    <li class="nav-item" role="presentation">
+                                        <button class="nav-link" id="barcodes-tab" data-bs-toggle="tab" data-bs-target="#barcodes" type="button" role="tab">
+                                            <i class="fas fa-barcode me-2"></i>Barcodes & QR Codes
+                                        </button>
+                                    </li>
+                                </ul>
+                            </div>
+                            <div class="card-body">
+                                <div class="tab-content" id="resultsTabContent">
+                                    <!-- Visual Comparison Tab -->
+                                    <div class="tab-pane fade show active" id="visual" role="tabpanel">
+                                        <div id="visualComparisonContent">
+                                            <!-- Content will be populated by JavaScript -->
+                                        </div>
+                                    </div>
+
+                                    <!-- Spelling Issues Tab -->
+                                    <div class="tab-pane fade" id="spelling" role="tabpanel">
+                                        <div id="spellingIssuesContent">
+                                            <!-- Content will be populated by JavaScript -->
+                                        </div>
+                                    </div>
+
+                                    <!-- Barcodes Tab -->
+                                    <div class="tab-pane fade" id="barcodes" role="tabpanel">
+                                        <div id="barcodesContent">
+                                            <!-- Content will be populated by JavaScript -->
+                                        </div>
+                                    </div>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+
+                    <!-- Error Section -->
+                    <div id="errorSection" class="alert alert-danger mt-4" style="display: none;">
+                        <i class="fas fa-exclamation-triangle me-2"></i>
+                        <span id="errorMessage"></span>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+
+    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
+    <script src="{{ url_for('static', filename='js/script.js') }}"></script>
+</body>
+</html>
test_setup.py
ADDED
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+"""
+Test script to verify PDF Comparison Tool setup
+"""
+
+import sys
+import importlib
+
+def test_imports():
+    """Test if all required packages can be imported"""
+    required_packages = [
+        'flask',
+        'cv2',
+        'numpy',
+        'PIL',
+        'pytesseract',
+        'pdf2image',
+        'pyzbar',
+        'spellchecker',
+        'nltk',
+        'skimage',
+        'matplotlib',
+        'pandas'
+    ]
+
+    print("Testing package imports...")
+    failed_imports = []
+
+    for package in required_packages:
+        try:
+            importlib.import_module(package)
+            print(f"✅ {package}")
+        except ImportError as e:
+            print(f"❌ {package}: {e}")
+            failed_imports.append(package)
+
+    return failed_imports
+
+def test_tesseract():
+    """Test if Tesseract OCR is available"""
+    print("\nTesting Tesseract OCR...")
+    try:
+        import pytesseract
+        # Try to get Tesseract version
+        version = pytesseract.get_tesseract_version()
+        print(f"✅ Tesseract version: {version}")
+        return True
+    except Exception as e:
+        print(f"❌ Tesseract not found: {e}")
+        print("Please install Tesseract OCR:")
+        print("  macOS: brew install tesseract")
+        print("  Ubuntu: sudo apt-get install tesseract-ocr")
+        print("  Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki")
+        return False
+
+def test_pdf_comparator():
+    """Test if PDFComparator class can be instantiated"""
+    print("\nTesting PDFComparator...")
+    try:
+        from pdf_comparator import PDFComparator
+        comparator = PDFComparator()
+        print("✅ PDFComparator initialized successfully")
+        return True
+    except Exception as e:
+        print(f"❌ PDFComparator error: {e}")
+        return False
+
+def test_flask_app():
+    """Test if Flask app can be imported"""
+    print("\nTesting Flask application...")
+    try:
+        from app import app
+        print("✅ Flask app imported successfully")
+        return True
+    except Exception as e:
+        print(f"❌ Flask app error: {e}")
+        return False
+
+def main():
+    """Run all tests"""
+    print("PDF Comparison Tool - Setup Test")
+    print("=" * 40)
+
+    # Test imports
+    failed_imports = test_imports()
+
+    # Test Tesseract
+    tesseract_ok = test_tesseract()
+
+    # Test PDFComparator
+    comparator_ok = test_pdf_comparator()
+
+    # Test Flask app
+    flask_ok = test_flask_app()
+
+    # Summary
+    print("\n" + "=" * 40)
+    print("SETUP SUMMARY")
+    print("=" * 40)
+
+    if failed_imports:
+        print(f"❌ Missing packages: {', '.join(failed_imports)}")
+        print("Run: pip install -r requirements.txt")
+    else:
+        print("✅ All packages imported successfully")
+
+    if tesseract_ok:
+        print("✅ Tesseract OCR is available")
+    else:
+        print("❌ Tesseract OCR is not available")
+
+    if comparator_ok:
+        print("✅ PDFComparator is working")
+    else:
+        print("❌ PDFComparator has issues")
+
+    if flask_ok:
+        print("✅ Flask application is ready")
+    else:
+        print("❌ Flask application has issues")
+
+    # Overall status
+    all_ok = not failed_imports and tesseract_ok and comparator_ok and flask_ok
+
+    if all_ok:
+        print("\n🎉 Setup is complete! You can run the application with:")
+        print("  python app.py")
+    else:
+        print("\n⚠️ Setup is incomplete. Please fix the issues above.")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()