2pac / find_bad_images.py
Richard Young
Initial commit for Hugging Face Space
c43a81f
#!/usr/bin/env python3
"""
2PAC: The Picture Analyzer & Corruption killer
Author: Richard Young
License: MIT
In memory of Jeff Young, who loved Tupac's music and lived by his values of helping others.
Like Tupac, Jeff believed in bringing people together and always lending a hand to those in need.
May your photos always be as clear as the memories they capture, and may we all strive to help others as Jeff did.
"""
import os
import argparse
import concurrent.futures
import sys
import time
import io
import json
import shutil
import hashlib
import struct
import tempfile
import subprocess
import random
from datetime import datetime
from pathlib import Path
from PIL import Image, ImageFile, UnidentifiedImageError
from tqdm import tqdm
import tqdm.auto as tqdm_auto
import colorama
import humanize
import logging
# Import 2PAC quotes; fall back to a single built-in quote when the optional
# local `quotes` module is not importable.
try:
    from quotes import QUOTES
except ImportError:
    # Default quotes if file is missing
    QUOTES = ["All Eyez On Your Images."]
# Initialize colorama (required for Windows)
colorama.init()
# Allow loading of truncated images for repair attempts.
# This is a process-wide Pillow setting, so it also affects every validation
# call in this module, not only the repair path.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Dictionary of supported image formats with their extensions.
# Values are tuples so they can be passed directly to str.endswith().
SUPPORTED_FORMATS = {
    'JPEG': ('.jpg', '.jpeg', '.jpe', '.jif', '.jfif', '.jfi'),
    'PNG': ('.png',),
    'GIF': ('.gif',),
    'TIFF': ('.tiff', '.tif'),
    'BMP': ('.bmp', '.dib'),
    'WEBP': ('.webp',),
    'ICO': ('.ico',),
    'HEIC': ('.heic',),
}
# Default formats (all supported formats)
DEFAULT_FORMATS = list(SUPPORTED_FORMATS.keys())
# List of formats that can potentially be repaired
REPAIRABLE_FORMATS = ['JPEG', 'PNG', 'GIF']
# Default progress directory (under the current user's home directory)
DEFAULT_PROGRESS_DIR = os.path.expanduser("~/.bad_image_finder/progress")
# Current version (stored in progress files and compared when they are loaded)
VERSION = "1.5.1"
# Security: Maximum file size to process (100MB) to prevent DoS
MAX_FILE_SIZE = 100 * 1024 * 1024
# Security: Maximum total pixel count accepted by validate_file_security,
# intended to guard against decompression bombs.
# NOTE(review): 50000 * 50000 is 2.5 billion pixels, not the "50 megapixels"
# this limit was originally documented as - confirm which limit is intended.
MAX_IMAGE_PIXELS = 50000 * 50000
def setup_logging(verbose, no_color=False):
    """
    Configure root logging with a console handler, optionally colorized.
    Args:
        verbose: When true, log at DEBUG level; otherwise at INFO level.
        no_color: When true, emit plain text without ANSI color codes.
    """
    level = logging.DEBUG if verbose else logging.INFO
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    if not no_color:
        import copy

        # Color scheme, keyed by the standard level names
        COLORS = {
            'DEBUG': colorama.Fore.CYAN,
            'INFO': colorama.Fore.GREEN,
            'WARNING': colorama.Fore.YELLOW,
            'ERROR': colorama.Fore.RED,
            'CRITICAL': colorama.Fore.MAGENTA + colorama.Style.BRIGHT,
            'RESET': colorama.Style.RESET_ALL
        }

        # Custom formatter with colors
        class ColoredFormatter(logging.Formatter):
            def format(self, record):
                levelname = record.levelname
                if levelname in COLORS:
                    # Colorize a copy: the same LogRecord object is passed to
                    # every handler, so mutating it in place would leak ANSI
                    # codes into other handlers and stack them when the record
                    # is formatted more than once.
                    record = copy.copy(record)
                    record.levelname = f"{COLORS[levelname]}{levelname}{COLORS['RESET']}"
                    record.msg = f"{COLORS[levelname]}{record.msg}{COLORS['RESET']}"
                return super().format(record)

        formatter = ColoredFormatter(log_format)
    else:
        formatter = logging.Formatter(log_format)
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    # basicConfig is a no-op when the root logger already has handlers.
    logging.basicConfig(
        level=level,
        handlers=[handler]
    )
def diagnose_image_issue(file_path):
    """
    Classify the most likely reason an image file is unreadable.

    The checks run in order: empty file, magic-number mismatch for JPEG/PNG
    extensions, Pillow verify(), then a Pillow load().
    Returns: (error_type, details)
    """
    try:
        with open(file_path, 'rb') as stream:
            magic = stream.read(16)  # first 16 bytes are enough for the signatures

        # Zero-byte file
        if not magic:
            return "empty_file", "File is empty (0 bytes)"

        # Signature must agree with the extension for JPEG and PNG
        lowered = file_path.lower()
        if lowered.endswith(SUPPORTED_FORMATS['JPEG']):
            if not magic.startswith(b'\xff\xd8\xff'):
                return "invalid_header", "Invalid JPEG header"
        elif lowered.endswith(SUPPORTED_FORMATS['PNG']):
            if not magic.startswith(b'\x89PNG\r\n\x1a\n'):
                return "invalid_header", "Invalid PNG header"

        # Let Pillow verify the file and map its error text to a category
        try:
            with Image.open(file_path) as img:
                img.verify()
        except Exception as exc:
            reason = str(exc).lower()
            if "truncated" in reason:
                return "truncated", "File is truncated"
            if "corrupt" in reason:
                return "corrupt_data", "Data corruption detected"
            if "incorrect mode" in reason or "decoder" in reason:
                return "decoder_issue", "Image decoder issue"
            return "unknown", f"Unknown issue: {exc}"

        # verify() passed; see whether the pixel data can actually be loaded
        try:
            with Image.open(file_path) as img:
                img.load()
        except Exception as exc:
            return "data_load_failed", f"Image data couldn't be loaded: {exc}"

        # Nothing above failed, so the cause could not be determined
        return "unknown", "Unknown issue"
    except Exception as exc:
        return "access_error", f"Error accessing file: {exc}"
def check_jpeg_structure(file_path):
    """
    Performs a deep check of JPEG file structure to find corruption that PIL might miss.

    Verifies the SOI/EOI markers, the presence of a Start of Frame and a Start
    of Scan marker, and that every length-prefixed segment encountered while
    scanning fits inside the file.
    Returns (is_valid, error_message)
    """
    try:
        with open(file_path, 'rb') as f:
            data = f.read()
        # Check for correct JPEG header (SOI marker)
        if not data.startswith(b'\xFF\xD8'):
            return False, "Invalid JPEG header (missing SOI marker)"
        # Check for proper EOI marker at the end
        if not data.endswith(b'\xFF\xD9'):
            return False, "Missing EOI marker at end of file"
        # SOF marker (Start of Frame) - at least one should be present.
        # Every code in 0xC0-0xCF is a SOF marker except DHT (0xC4), JPG (0xC8)
        # and DAC (0xCC). Accepting only SOF0-SOF3 would reject valid files
        # that use e.g. arithmetic coding (SOF9).
        sof_codes = [code for code in range(0xC0, 0xD0) if code not in (0xC4, 0xC8, 0xCC)]
        if not any(bytes((0xFF, code)) in data for code in sof_codes):
            return False, "No Start of Frame (SOF) marker found"
        # Check for SOS marker (Start of Scan)
        if b'\xFF\xDA' not in data:
            return False, "No Start of Scan (SOS) marker found"
        # Scan through the file to check marker structure
        size = len(data)
        i = 2  # Skip SOI marker
        while i < size - 1:
            code = data[i + 1]
            # 0xFF00 is a stuffed data byte and 0xFFFF is fill; neither is a marker
            if data[i] == 0xFF and code != 0x00 and code != 0xFF:
                # For markers with length fields, validate length
                if (0xC0 <= code <= 0xCF and code != 0xC4 and code != 0xC8) or \
                        (0xDB <= code <= 0xFE):
                    if i + 4 >= size:
                        return False, f"Truncated marker {code:02X} at position {i}"
                    length = struct.unpack_from('>H', data, i + 2)[0]
                    if i + 2 + length > size:
                        return False, f"Invalid segment length for marker {code:02X}"
                    # Jump over the whole segment (marker + declared length)
                    i += 2 + length
                    continue
            # Move to next byte
            i += 1
        return True, "JPEG structure appears valid"
    except Exception as e:
        return False, f"Error during JPEG structure check: {str(e)}"
def check_png_structure(file_path):
    """
    Walk the chunk layout of a PNG file and report structural corruption.

    Checks the signature, the minimum size, the trailing IEND chunk, and that
    each chunk's declared length fits inside the file. Chunk CRCs are not
    verified.
    Returns (is_valid, error_message)
    """
    signature = b'\x89PNG\r\n\x1a\n'
    iend_tail = b'IEND\xaeB`\x82'  # IEND type followed by its fixed CRC
    try:
        with open(file_path, 'rb') as stream:
            blob = stream.read()
        total = len(blob)

        if not blob.startswith(signature):
            return False, "Invalid PNG signature"
        # 8 bytes signature + 12 bytes for the smallest possible chunk
        if total < 8 + 12:
            return False, "PNG file too small to contain valid header"
        if not blob.endswith(iend_tail):
            return False, "Missing IEND chunk at end of file"

        seen = {'IHDR': False}
        offset = 8  # first chunk starts right after the signature
        while offset < total:
            if offset + 8 > total:
                return False, "Truncated chunk header"
            (body_len,) = struct.unpack_from('>I', blob, offset)
            kind = blob[offset + 4:offset + 8].decode('ascii', errors='replace')
            # Length (4) + Type (4) + Data (body_len) + CRC (4)
            chunk_end = offset + body_len + 12
            if chunk_end > total:
                return False, f"Truncated {kind} chunk"
            if kind in seen:
                seen[kind] = True
            if kind == 'IHDR' and body_len != 13:
                return False, "Invalid IHDR chunk length"
            if offset == 8 and kind != 'IHDR':
                return False, "First chunk must be IHDR"
            if kind == 'IEND' and chunk_end != total:
                return False, "Data after IEND chunk"
            offset = chunk_end

        missing = [name for name, present in seen.items() if not present]
        if missing:
            return False, f"Missing required {missing[0]} chunk"
        return True, "PNG structure appears valid"
    except Exception as exc:
        return False, f"Error during PNG structure check: {exc}"
def validate_subprocess_path(file_path):
    """
    Validate file path before passing to subprocess to prevent command injection.
    Args:
        file_path: Path to validate (str or os.PathLike)
    Returns:
        True if path is safe
    Raises:
        ValueError: If path contains dangerous characters or patterns
    """
    # Accept pathlib.Path too; the substring checks below need a plain string.
    file_path = os.fspath(file_path)
    # Block null bytes first: os.path.exists() reports such paths as missing,
    # which would otherwise hide this check behind "File does not exist".
    if '\x00' in file_path:
        raise ValueError("Null byte detected in path")
    # Must be an absolute path
    if not os.path.isabs(file_path):
        raise ValueError(f"Path must be absolute: {file_path}")
    # File must exist
    if not os.path.exists(file_path):
        raise ValueError(f"File does not exist: {file_path}")
    # Check for shell metacharacters and dangerous patterns
    # Block: semicolons, pipes, backticks, $, &, >, <, newlines, parentheses
    dangerous_chars = ['`', '$', '&', '|', ';', '>', '<', '\n', '\r', '(', ')']
    for char in dangerous_chars:
        if char in file_path:
            raise ValueError(f"Dangerous character '{char}' found in path: {file_path}")
    # Block path traversal attempts (deliberately strict: any '..' substring)
    if '..' in file_path:
        raise ValueError(f"Path traversal pattern '..' detected: {file_path}")
    return True
def try_external_tools(file_path):
    """
    Try using external tools to validate the image if they're available.

    exiftool and ImageMagick's identify are tried independently: a tool that is
    missing, times out, or cannot be launched is skipped without affecting the
    other one.
    Returns (is_valid, message)
    Security: Validates file path before passing to subprocess to prevent
    command injection attacks. Commands are run as argument lists (no shell).
    """
    # Validate path before passing to subprocess
    try:
        validate_subprocess_path(file_path)
    except ValueError as e:
        logging.warning(f"Skipping external tool validation due to security check: {e}")
        return True, "External tools check skipped (security)"

    any_tool_ran = False

    # Try using exiftool if available
    try:
        # errors='replace' keeps undecodable metadata bytes from raising
        # UnicodeDecodeError, which is not a SubprocessError.
        result = subprocess.run(['exiftool', '-m', '-p', '$Error', file_path],
                                capture_output=True, text=True, errors='replace',
                                timeout=5)
    except (subprocess.SubprocessError, OSError):
        # Tool not installed, not executable, or timed out
        result = None
    if result is not None:
        any_tool_ran = True
        if result.returncode == 0 and result.stdout.strip():
            return False, f"Exiftool error: {result.stdout.strip()}"

    # Check with identify (ImageMagick) if available
    try:
        # Only the exit status is used, so the verbose output is discarded.
        result = subprocess.run(['identify', '-verbose', file_path],
                                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                                timeout=5)
    except (subprocess.SubprocessError, OSError):
        result = None
    if result is not None:
        any_tool_ran = True
        if result.returncode != 0:
            return False, "ImageMagick identify failed to read the image"

    if any_tool_ran:
        return True, "Passed external tool validation"
    # External tools not available or failed
    return True, "External tools check skipped"
def try_full_decode_check(file_path):
    """
    Fully decode the image, and re-encode it as BMP when the mode allows it.
    This catches more subtle corruption that might otherwise be missed.
    Returns (is_valid, message)
    """
    # Modes the BMP writer is known to accept. Other modes (e.g. CMYK, LA,
    # I;16) would make save() fail with "cannot write mode ..." even though the
    # image is intact, so for those the successful load() is the whole test.
    bmp_writable_modes = ('1', 'L', 'P', 'RGB')
    try:
        with Image.open(file_path) as img:
            # Force a complete decode of the pixel data
            img.load()
            if img.mode in bmp_writable_modes:
                # Write to the open file object rather than reopening by name:
                # a NamedTemporaryFile cannot be opened a second time on Windows.
                with tempfile.TemporaryFile() as tmp:
                    img.save(tmp, format="BMP")
        # If we get here, the image data could be fully decoded
        return True, "Full decode test passed"
    except Exception as e:
        return False, f"Full decode test failed: {str(e)}"
def check_visual_corruption(file_path, block_threshold=0.20, uniform_threshold=10, strict_mode=False):
    """
    Analyze image content to detect visual corruption like large uniform areas.

    The image is sampled on a regular grid, each sample is quantized into
    buckets of `uniform_threshold` per channel, and the share of the most
    frequent bucket is compared with `block_threshold`.
    Args:
        file_path: Path to the image file
        block_threshold: Fraction of samples that must share one color bucket
            before the area is examined as possible corruption (0.0-1.0)
        uniform_threshold: Bucket width used to quantize each RGB channel
        strict_mode: If True, also run the color-fragmentation and
            color-distribution heuristics
    Returns:
        (is_visually_corrupt, details). Any error during analysis is reported
        as (False, "Error during visual analysis: ...").
    """
    from collections import Counter

    try:
        with Image.open(file_path) as img:
            width, height = img.size
            # Convert to RGB to ensure consistent analysis
            if img.mode != "RGB":
                img = img.convert("RGB")
            # Sample on a grid (analyzing every pixel would be too slow);
            # roughly 150 samples along the shorter side.
            sample_step = max(1, min(width, height) // 150)
            color_counts = Counter()
            for y in range(0, height, sample_step):
                for x in range(0, width, sample_step):
                    pixel = img.getpixel((x, y))
                    # Round pixel values to reduce sensitivity to minor variations
                    bucket = tuple(
                        channel // uniform_threshold * uniform_threshold
                        for channel in pixel[:3]
                    )
                    color_counts[bucket] += 1
            total_samples = sum(color_counts.values())
            # Find the most common color (raises on an image with no samples,
            # which is reported through the generic error path below)
            most_common_color = max(color_counts.items(), key=lambda item: item[1])
            most_common_percentage = most_common_color[1] / total_samples
            # Check for large blocks of uniform color (potential corruption)
            if most_common_percentage > block_threshold:
                affected_pct = most_common_percentage * 100
                color_value = most_common_color[0]
                # Gray/black areas are common in corruption
                is_dark = sum(color_value) < 3 * uniform_threshold  # Very dark areas
                # Gray means (near-)equal R, G and B values
                is_gray = abs(color_value[0] - color_value[1]) < uniform_threshold and \
                    abs(color_value[1] - color_value[2]) < uniform_threshold and \
                    abs(color_value[0] - color_value[2]) < uniform_threshold
                # Only consider mid-range grays as corruption indicators (not white/black)
                is_mid_gray = is_gray and 30 < sum(color_value) / 3 < 220
                # Almost pure white is often legitimate content
                is_white = all(channel > 240 for channel in color_value)
                if (is_dark or is_mid_gray) and not is_white:
                    return True, f"Visual corruption detected: {affected_pct:.1f}% of image is uniform {color_value}"
                # Could be a legitimate image with a uniform background
                return False, f"Large uniform area ({affected_pct:.1f}%) but likely not corruption"
            # Check for other telltale signs of corruption - but only in strict mode
            if strict_mode:
                # 1. Excessive color blocks (fragmentation): nearly every sample
                #    falls in its own bucket, which is typical of noise
                if len(color_counts) > total_samples * 0.85 and total_samples > 200:
                    return True, f"Excessive color fragmentation detected ({len(color_counts)} colors in {total_samples} samples)"
                # 2. Unusually flat color distribution
                if total_samples > 500:  # Only for larger images with enough samples
                    sorted_counts = sorted(color_counts.values(), reverse=True)
                    if len(sorted_counts) > 5:
                        top5_ratio = sum(sorted_counts[:5]) / sum(sorted_counts)
                        # Flag when the five most common buckets together cover
                        # less than 20% of the samples and no single bucket
                        # reaches 10%.
                        if top5_ratio < 0.2 and most_common_percentage < 0.1:
                            return True, "Unusual color distribution (possible noise/corruption)"
            return False, "No visual corruption detected"
    except Exception as e:
        return False, f"Error during visual analysis: {str(e)}"
def is_valid_image(file_path, thorough=True, sensitivity='medium', ignore_eof=False, check_visual=False, visual_strictness='medium'):
    """
    Decide whether an image file is intact, using layered checks.

    Order of checks: Pillow verify(), Pillow load(), optional visual analysis,
    then (when thorough and sensitivity is not 'low') format-specific structure
    checks for JPEG and PNG files.
    Args:
        file_path: Path to the image file
        thorough: Whether to perform deep structure validation
        sensitivity: 'low', 'medium', or 'high'; 'low' skips the deep checks
        ignore_eof: Whether to ignore missing end-of-file markers
        check_visual: Whether to perform visual content analysis to detect corruption
        visual_strictness: 'low', 'medium', or 'high' strictness for visual corruption detection
    Returns:
        True if valid, False if corrupt. Any exception counts as corrupt.
    """
    try:
        # Fast path: header verification followed by a real decode
        with Image.open(file_path) as header_probe:
            header_probe.verify()
        with Image.open(file_path) as decoded:
            decoded.load()

        if check_visual:
            # (block_threshold, uniform_threshold) per strictness level
            if visual_strictness == 'low':
                # Permissive: only very obvious corruption
                block_threshold, uniform_threshold = 0.3, 5
            elif visual_strictness == 'high':
                # Strict: catches subtle corruption, more false positives
                block_threshold, uniform_threshold = 0.15, 15
            else:  # medium (default)
                block_threshold, uniform_threshold = 0.20, 10
            is_visually_corrupt, msg = check_visual_corruption(
                file_path,
                block_threshold=block_threshold,
                uniform_threshold=uniform_threshold,
                # Extra heuristics only at the highest strictness
                strict_mode=(visual_strictness == 'high')
            )
            if is_visually_corrupt:
                logging.debug(f"Visual corruption detected in {file_path}: {msg}")
                return False

        # Basic checks are all that was asked for
        if not thorough or sensitivity == 'low':
            return True

        lowered = file_path.lower()
        if lowered.endswith(tuple(SUPPORTED_FORMATS['JPEG'])):
            ok, reason = check_jpeg_structure(file_path)
            if not ok:
                # A missing EOI marker alone can be tolerated on request
                if ignore_eof and reason == "Missing EOI marker at end of file":
                    logging.debug(f"Ignoring missing EOI marker for {file_path} as requested")
                else:
                    logging.debug(f"JPEG structure invalid for {file_path}: {reason}")
                    return False
            ok, reason = try_full_decode_check(file_path)
            if not ok:
                logging.debug(f"Full decode test failed for {file_path}: {reason}")
                return False
            ok, reason = try_external_tools(file_path)
            if not ok:
                logging.debug(f"External tool validation failed for {file_path}: {reason}")
                return False
        elif lowered.endswith(tuple(SUPPORTED_FORMATS['PNG'])):
            ok, reason = check_png_structure(file_path)
            if not ok:
                logging.debug(f"PNG structure invalid for {file_path}: {reason}")
                return False
            ok, reason = try_full_decode_check(file_path)
            if not ok:
                logging.debug(f"Full decode test failed for {file_path}: {reason}")
                return False
        return True
    except Exception as exc:
        logging.debug(f"Invalid image {file_path}: {exc}")
        return False
def attempt_repair(file_path, backup_dir=None):
    """
    Attempts to repair corrupt image files by re-encoding them with Pillow.

    The file is rewritten in place. If the rewritten file still fails
    validation, or the repair raises after the file was modified, the original
    bytes are written back so a failed repair never leaves a re-encoded copy
    behind.
    Args:
        file_path: Path to the image file to repair
        backup_dir: Optional directory that receives "<basename>.bak" before
            any change is made; a failed backup is logged and does not stop
            the repair
    Returns: (success, message, fixed_width, fixed_height)
    """
    # Create backup if requested
    if backup_dir:
        backup_path = os.path.join(backup_dir, os.path.basename(file_path) + ".bak")
        try:
            shutil.copy2(file_path, backup_path)
            logging.debug(f"Created backup at {backup_path}")
        except Exception as e:
            logging.warning(f"Could not create backup: {str(e)}")
    try:
        # First, diagnose the issue
        issue_type, details = diagnose_image_issue(file_path)
        logging.debug(f"Diagnosis for {file_path}: {issue_type} - {details}")
        file_ext = os.path.splitext(file_path)[1].lower()
        # Check if file format is supported for repair
        if not any(file_ext in SUPPORTED_FORMATS[fmt] for fmt in REPAIRABLE_FORMATS):
            return False, f"Format not supported for repair ({file_ext})", None, None

        original_bytes = None
        modified = False

        def _overwrite(payload):
            with open(file_path, 'wb') as f:
                f.write(payload)

        # Try to open and resave the image with PIL's error forgiveness
        # This works for many truncated files
        try:
            with Image.open(file_path) as img:
                width, height = img.size
                image_format = img.format
                buffer = io.BytesIO()
                img.save(buffer, format=image_format)
            # Keep the untouched bytes so a failed repair can be undone
            with open(file_path, 'rb') as f:
                original_bytes = f.read()
            # Write the repaired image back only after the source image is
            # closed, so the file is not open for reading while overwritten
            _overwrite(buffer.getvalue())
            modified = True
            # Verify the repaired image
            if is_valid_image(file_path):
                return True, f"Repaired {issue_type} issue", width, height
            # If verification fails, try again with JPEG specific options
            if image_format == 'JPEG':
                with Image.open(file_path) as img:
                    buffer = io.BytesIO()
                    # Use optimize=True and quality=85 for better repair chances
                    img.save(buffer, format='JPEG', optimize=True, quality=85)
                _overwrite(buffer.getvalue())
                if is_valid_image(file_path):
                    return True, f"Repaired {issue_type} issue with JPEG optimization", width, height
            _overwrite(original_bytes)
            return False, f"Failed to repair {issue_type} issue", None, None
        except Exception as e:
            logging.debug(f"Repair attempt failed for {file_path}: {str(e)}")
            if modified and original_bytes is not None:
                # Best effort: put the original content back
                try:
                    _overwrite(original_bytes)
                except OSError as restore_error:
                    logging.warning(f"Could not restore original file {file_path}: {restore_error}")
            return False, f"Repair failed: {str(e)}", None, None
    except Exception as e:
        logging.debug(f"Error during repair of {file_path}: {str(e)}")
        return False, f"Repair error: {str(e)}", None, None
def process_file(args):
    """
    Process a single image file.
    Args:
        args: Tuple of (file_path, repair_mode, repair_dir, thorough_check,
            sensitivity, ignore_eof, check_visual, visual_strictness,
            enable_security_checks). A single tuple is used so the function can
            be mapped over a list of work items.
    Returns:
        (file_path, is_valid, size, repair_status, repair_message, dimensions)
        where size is the file size in bytes for invalid files (0 for valid or
        repaired files, or when the size cannot be read), repair_status is one
        of "security_failed", "repaired", "repair_failed" or "not_repaired",
        and dimensions is (width, height) for repaired files, else None.
    """
    (file_path, repair_mode, repair_dir, thorough_check, sensitivity,
     ignore_eof, check_visual, visual_strictness, enable_security_checks) = args

    def _size_or_zero(path):
        # The file may vanish or become unreadable between checks; report 0
        # instead of letting the whole work item fail.
        try:
            return os.path.getsize(path)
        except OSError:
            return 0

    # Security validation (if enabled)
    if enable_security_checks:
        try:
            is_safe, warnings = validate_file_security(file_path, check_size=True, check_dimensions=True)
            # Log security warnings
            for warning in warnings:
                logging.warning(f"Security warning for {file_path}: {warning}")
            if not is_safe:
                # File failed security checks - treat as invalid
                return file_path, False, _size_or_zero(file_path), "security_failed", "Failed security validation", None
        except ValueError as e:
            # Critical security failure (file too large, dimensions too big, etc.)
            logging.error(f"Security check failed for {file_path}: {e}")
            return file_path, False, _size_or_zero(file_path), "security_failed", str(e), None
        except Exception as e:
            # Unexpected error during security validation;
            # continue processing anyway for this case
            logging.debug(f"Security validation error for {file_path}: {e}")
    # Check if the image is valid
    is_valid = is_valid_image(file_path, thorough=thorough_check, sensitivity=sensitivity,
                              ignore_eof=ignore_eof, check_visual=check_visual,
                              visual_strictness=visual_strictness)
    if not is_valid and repair_mode:
        # Try to repair the file
        repair_success, repair_msg, width, height = attempt_repair(file_path, repair_dir)
        if repair_success:
            # File was repaired
            return file_path, True, 0, "repaired", repair_msg, (width, height)
        # File is still corrupt
        return file_path, False, _size_or_zero(file_path), "repair_failed", repair_msg, None
    # No repair attempted or file is valid
    size = _size_or_zero(file_path) if not is_valid else 0
    return file_path, is_valid, size, "not_repaired", None, None
def get_session_id(directory, formats, recursive):
    """
    Derive a stable 16-character session ID from the scan parameters.

    The ID is the start of the SHA-256 hex digest of the directory, the sorted
    comma-joined formats and the recursive flag, concatenated in that order, so
    the order of `formats` does not matter.
    """
    fingerprint = "".join((
        str(directory),
        ",".join(sorted(formats)),
        str(recursive),
    ))
    # SHA-256 rather than MD5, which is cryptographically broken
    digest = hashlib.sha256(fingerprint.encode('utf-8')).hexdigest()
    return digest[:16]
def _deduplicate(seq):
"""Return a list with duplicates removed while preserving order."""
seen = set()
deduped = []
for item in seq:
if item not in seen:
deduped.append(item)
seen.add(item)
return deduped
def validate_file_security(file_path, check_size=True, check_dimensions=True):
    """
    Perform security validation on a file before processing.
    Args:
        file_path: Path to the file
        check_size: Whether to check file size limits
        check_dimensions: Whether to check image dimension limits
    Returns:
        (is_safe, warnings) - tuple of boolean and list of warning messages.
        is_safe is always True on return; hard failures are raised instead.
    Raises:
        ValueError: If the file does not exist, exceeds MAX_FILE_SIZE, exceeds
            MAX_IMAGE_PIXELS, or cannot be opened as an image
    """
    warnings = []
    # Check if file exists
    if not os.path.exists(file_path):
        raise ValueError(f"File does not exist: {file_path}")
    # Check file size to prevent DoS via huge files
    if check_size:
        file_size = os.path.getsize(file_path)
        if file_size > MAX_FILE_SIZE:
            raise ValueError(f"File too large ({file_size} bytes, max {MAX_FILE_SIZE}). "
                             f"This could indicate a malicious file or decompression bomb.")
        # Warn about suspiciously large files (over 10MB for images is unusual)
        if file_size > 10 * 1024 * 1024:
            warnings.append(f"Large file size: {humanize.naturalsize(file_size)}")
    # Check image dimensions to prevent decompression bombs
    if check_dimensions:
        # Only opening the image is wrapped: the ValueError raised below for
        # oversized images must reach the caller with its own message instead
        # of being re-wrapped as "Error validating image: ...".
        try:
            with Image.open(file_path) as img:
                width, height = img.size
                actual_format = img.format
        except UnidentifiedImageError as e:
            raise ValueError("Cannot identify image format - file may be corrupted or malicious") from e
        except Exception as e:
            raise ValueError(f"Error validating image: {str(e)}") from e
        total_pixels = width * height
        if total_pixels > MAX_IMAGE_PIXELS:
            raise ValueError(f"Image dimensions too large ({width}x{height} = {total_pixels} pixels, "
                             f"max {MAX_IMAGE_PIXELS}). This could be a decompression bomb attack.")
        # Warn about very large images
        if total_pixels > 10000 * 10000:
            warnings.append(f"Large image dimensions: {width}x{height}")
        # Check for format mismatch (file extension vs actual format)
        path_text = os.fspath(file_path)
        lowered = path_text.lower()
        expected_formats = [
            fmt for fmt, extensions in SUPPORTED_FORMATS.items()
            if lowered.endswith(extensions)
        ]
        if actual_format and expected_formats and actual_format not in expected_formats:
            extension = os.path.splitext(path_text)[1][1:]
            warnings.append(f"Format mismatch: file has '{extension}' extension "
                            f"but is actually '{actual_format}' format")
    return True, warnings
def calculate_file_hash(file_path, algorithm='sha256'):
    """
    Compute the hex digest of a file's contents.
    Args:
        file_path: Path to the file
        algorithm: Name of a hashlib algorithm (sha256, sha512, etc.)
    Returns:
        Hexadecimal hash string
    """
    digest = hashlib.new(algorithm)
    # Feed the file in fixed-size pieces so large files are never fully in memory
    with open(file_path, 'rb') as stream:
        while True:
            piece = stream.read(4096)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()
def safe_join_path(base_dir, user_path):
    """
    Safely join paths and prevent path traversal attacks.

    Both the base directory and the joined path are resolved with
    os.path.realpath, so '..' components, absolute user paths and symlinks
    that point outside the base directory are all rejected.
    Args:
        base_dir: Base directory (trusted)
        user_path: User-provided path component (untrusted)
    Returns:
        Safe absolute path within base_dir, with symlinks resolved
    Raises:
        ValueError: If path traversal is detected
    """
    # Normalize base directory and resolve symlinks in it
    base_dir = os.path.realpath(base_dir)
    # Join, normalize, and resolve any symlinks in the result
    full_path = os.path.realpath(os.path.join(base_dir, user_path))
    # Ensure the result is within base_dir. commonpath compares whole path
    # components, so a sibling such as "<base>_evil" is not mistaken for a
    # child, and a base of the filesystem root works as well.
    try:
        common = os.path.commonpath([base_dir, full_path])
        inside = os.path.normcase(common) == os.path.normcase(base_dir)
    except ValueError:
        # e.g. paths on different drives on Windows
        inside = False
    if not inside:
        raise ValueError(f"Path traversal detected: '{user_path}' resolves outside base directory")
    return full_path
def save_progress(session_id, directory, formats, recursive, processed_files,
                  bad_files, repaired_files, progress_dir=DEFAULT_PROGRESS_DIR):
    """
    Save the current progress to a JSON file.
    Args:
        session_id: Identifier used in the progress file name
        directory: Directory being scanned (stored as a string)
        formats: Formats being scanned
        recursive: Whether the scan is recursive
        processed_files: Paths processed so far
        bad_files: Paths found to be corrupt so far
        repaired_files: Paths repaired so far
        progress_dir: Directory that holds progress files; created if missing
    Returns:
        Path of the written progress file
    """
    # Create progress directory if it doesn't exist
    os.makedirs(progress_dir, exist_ok=True)
    # Create a progress state object
    progress_state = {
        'version': VERSION,
        'timestamp': datetime.now().isoformat(),
        'directory': str(directory),
        'formats': formats,
        'recursive': recursive,
        'processed_files': _deduplicate(processed_files),
        'bad_files': _deduplicate(bad_files),
        'repaired_files': _deduplicate(repaired_files)
    }
    # Save to file using JSON instead of pickle for security
    # This prevents arbitrary code execution via malicious progress files
    progress_file = os.path.join(progress_dir, f"session_{session_id}.progress.json")
    # Write to a sibling temp file and swap it in, so an interrupted save can
    # never leave a truncated progress file behind.
    temp_file = f"{progress_file}.tmp"
    try:
        with open(temp_file, 'w') as f:
            json.dump(progress_state, f, indent=2)
        os.replace(temp_file, progress_file)
    except BaseException:
        # Don't leave a partial temp file around; the previous progress file
        # (if any) is still intact.
        try:
            os.remove(temp_file)
        except OSError:
            pass
        raise
    logging.debug(f"Progress saved to {progress_file}")
    return progress_file
def load_progress(session_id, progress_dir=DEFAULT_PROGRESS_DIR):
    """
    Load the saved state of a session.

    Looks for "session_<id>.progress.json" first and falls back to the legacy
    "session_<id>.progress" pickle file.
    Returns:
        The progress dictionary, or None when no file exists or loading fails.
    """
    json_path = os.path.join(progress_dir, f"session_{session_id}.progress.json")
    legacy_path = os.path.join(progress_dir, f"session_{session_id}.progress")

    # Prefer JSON format for security
    if os.path.exists(json_path):
        progress_file, use_json = json_path, True
    elif os.path.exists(legacy_path):
        progress_file, use_json = legacy_path, False
        logging.warning("Loading legacy pickle format. This format is deprecated for security reasons.")
    else:
        return None

    try:
        if use_json:
            # Secure JSON deserialization
            with open(progress_file, 'r') as handle:
                progress_state = json.load(handle)
        else:
            # SECURITY NOTE(review): pickle.load can execute arbitrary code
            # from a crafted file. Kept only for legacy progress files.
            # TODO: Remove pickle support in future versions
            import pickle
            with open(progress_file, 'rb') as handle:
                progress_state = pickle.load(handle)
            logging.warning("SECURITY WARNING: Loaded progress file using unsafe pickle format. "
                            "Please delete old .progress files and use new .progress.json format.")

        # Remove any duplicate entries from lists
        for list_key in ('processed_files', 'bad_files', 'repaired_files'):
            if list_key in progress_state:
                progress_state[list_key] = _deduplicate(progress_state[list_key])

        # Check version compatibility
        saved_version = progress_state.get('version', '0.0.0')
        if saved_version != VERSION:
            logging.warning("Progress file was created with a different version. Some incompatibilities may exist.")

        logging.info(f"Loaded progress from {progress_file}")
        return progress_state
    except Exception as exc:
        logging.error(f"Failed to load progress: {exc}")
        return None
def list_saved_sessions(progress_dir=DEFAULT_PROGRESS_DIR):
    """
    Summarize every saved session found in progress_dir.
    Returns:
        A list of dicts (id, timestamp, directory, formats, processed_count,
        bad_count, repaired_count, filepath, format), newest timestamp first.
        Files that cannot be read are skipped and logged at debug level.
    """
    if not os.path.exists(progress_dir):
        return []

    sessions = []
    for filename in os.listdir(progress_dir):
        use_json = filename.endswith('.progress.json')
        # Support both new JSON format and legacy pickle format
        if not (use_json or filename.endswith('.progress')):
            continue
        try:
            filepath = os.path.join(progress_dir, filename)
            if use_json:
                with open(filepath, 'r') as handle:
                    state = json.load(handle)
            else:
                # SECURITY NOTE(review): pickle.load can execute arbitrary
                # code from a crafted file. Kept only for legacy sessions.
                import pickle
                with open(filepath, 'rb') as handle:
                    state = pickle.load(handle)

            # Extract session ID from filename
            suffix = '.progress.json' if use_json else '.progress'
            session_id = filename.replace('session_', '').replace(suffix, '')

            sessions.append({
                'id': session_id,
                'timestamp': state.get('timestamp', 'Unknown'),
                'directory': state.get('directory', 'Unknown'),
                'formats': state.get('formats', []),
                'processed_count': len(state.get('processed_files', [])),
                'bad_count': len(state.get('bad_files', [])),
                'repaired_count': len(state.get('repaired_files', [])),
                'filepath': filepath,
                'format': 'JSON' if use_json else 'Pickle (Legacy)'
            })
        except Exception as exc:
            logging.debug(f"Failed to load session from {filename}: {exc}")

    # Sort by timestamp, newest first
    sessions.sort(key=lambda entry: entry['timestamp'], reverse=True)
    return sessions
def get_extensions_for_formats(formats):
    """Return a tuple of every file extension belonging to the given formats.

    Format names that are not in SUPPORTED_FORMATS are ignored.
    """
    return tuple(
        extension
        for fmt in formats
        if fmt in SUPPORTED_FORMATS
        for extension in SUPPORTED_FORMATS[fmt]
    )
def find_image_files(directory, formats, recursive=True):
    """
    Collect the paths of image files with the requested formats.
    Args:
        directory: Directory to scan
        formats: Format names to look for (keys of SUPPORTED_FORMATS)
        recursive: Walk subdirectories when True, otherwise only list directory
    Returns:
        List of matching file paths; empty when no known format was requested.
    """
    extensions = get_extensions_for_formats(formats)
    if not extensions:
        logging.warning("No valid image formats specified!")
        return []

    format_names = ", ".join(formats)
    if recursive:
        logging.info(f"Recursively scanning for {format_names} files...")
        image_files = [
            os.path.join(root, name)
            for root, _, names in os.walk(directory)
            for name in names
            if name.lower().endswith(extensions)
        ]
    else:
        logging.info(f"Scanning for {format_names} files in {directory} (non-recursive)...")
        image_files = [
            os.path.join(directory, name)
            for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
            and name.lower().endswith(extensions)
        ]

    logging.info(f"Found {len(image_files)} image files")
    return image_files
def process_images(directory, formats, dry_run=True, repair=False,
                   max_workers=None, recursive=True, move_to=None, repair_dir=None,
                   save_progress_interval=5, resume_session=None, progress_dir=DEFAULT_PROGRESS_DIR,
                   thorough_check=False, sensitivity='medium', ignore_eof=False, check_visual=False,
                   visual_strictness='medium', enable_security_checks=False):
    """Find corrupt image files and optionally repair, delete, or move them.

    Files are discovered with ``find_image_files`` and checked in a process
    pool by ``process_file``. Progress is persisted through ``save_progress``
    periodically, when the user interrupts the run, and once more at the end.

    Args:
        directory: Root directory to scan.
        formats: Format names used to select file extensions.
        dry_run: When true, corrupt files are only reported ("Would delete").
        repair: Forwarded to ``process_file``; also allows ``repair_dir`` to
            be created.
        max_workers: Worker count handed to ``ProcessPoolExecutor``.
        recursive: Scan subdirectories as well when true.
        move_to: If set (and not a dry run), corrupt files are moved below
            this directory, keeping their path relative to ``directory``.
            If unset (and not a dry run), corrupt files are deleted.
        repair_dir: Backup directory; created if missing when ``repair`` is set.
        save_progress_interval: Minutes between periodic progress saves;
            ``0`` disables the periodic saves only.
        resume_session: ID of a saved session whose processed files should be
            skipped. It is honoured only if its directory and formats match.
        progress_dir: Directory handed to ``load_progress``/``save_progress``.
        thorough_check, sensitivity, ignore_eof, check_visual,
        visual_strictness, enable_security_checks: Forwarded unchanged to
            ``process_file``.

    Returns:
        A tuple ``(bad_files, repaired_files, total_size_saved)``. The two
        lists include entries restored from a resumed session;
        ``total_size_saved`` is the sum of the sizes ``process_file`` reported
        for this run's corrupt, unrepaired files (also counted in dry-run).

    Raises:
        KeyboardInterrupt: Re-raised after the progress has been saved.
    """
    start_time = time.time()

    # Generate session ID for this scan
    session_id = get_session_id(directory, formats, recursive)

    processed_files = []
    bad_files = []
    repaired_files = []
    total_size_saved = 0
    last_progress_save = time.time()

    # If resuming, load previous progress
    if resume_session:
        try:
            progress = load_progress(resume_session, progress_dir)
            if progress and progress['directory'] == str(directory) and progress['formats'] == formats:
                processed_files = progress['processed_files']
                bad_files = progress['bad_files']
                repaired_files = progress['repaired_files']
                logging.info(f"Resuming session: {len(processed_files)} files already processed")
            else:
                if progress:
                    logging.warning("Session parameters don't match current parameters. Starting fresh scan.")
                else:
                    logging.warning(f"Couldn't find session {resume_session}. Starting fresh scan.")
        except Exception as e:
            logging.error(f"Error loading session: {str(e)}. Starting fresh scan.")

    # Find all image files
    image_files = find_image_files(directory, formats, recursive)
    if not image_files:
        logging.warning("No image files found!")
        return [], [], 0

    # Filter out already processed files if resuming
    if processed_files:
        # A set keeps the membership test O(1) per file instead of scanning
        # the whole list of processed paths for every candidate.
        already_done = set(processed_files)
        remaining_files = [f for f in image_files if f not in already_done]
        skipped_count = len(image_files) - len(remaining_files)
        image_files = remaining_files
        logging.info(f"Skipping {skipped_count} already processed files")
        if not image_files:
            logging.info("All files have already been processed in the previous session!")
            return bad_files, repaired_files, total_size_saved

    # Create directories if they don't exist
    if move_to and not os.path.exists(move_to):
        os.makedirs(move_to)
        logging.info(f"Created directory for corrupt files: {move_to}")
    if repair and repair_dir and not os.path.exists(repair_dir):
        os.makedirs(repair_dir)
        logging.info(f"Created directory for backup files: {repair_dir}")

    # Prepare input arguments for workers
    input_args = [
        (file_path, repair, repair_dir, thorough_check, sensitivity,
         ignore_eof, check_visual, visual_strictness, enable_security_checks)
        for file_path in image_files
    ]

    # Process files in parallel
    logging.info("Processing files in parallel...")

    # Progress bar that also persists the scan state every
    # ``save_progress_interval`` minutes.
    class ProgressSavingBar(tqdm_auto.tqdm):
        def update(self, n=1):
            nonlocal last_progress_save
            result = super().update(n)
            current_time = time.time()
            if save_progress_interval > 0 and current_time - last_progress_save >= save_progress_interval * 60:
                # The tracking lists are updated before update() is called,
                # so they can be persisted as-is.
                save_progress(
                    session_id,
                    directory,
                    formats,
                    recursive,
                    processed_files,
                    bad_files,
                    repaired_files,
                    progress_dir,
                )
                last_progress_save = current_time
                logging.debug(f"Progress saved at {self.n} / {len(image_files)} files")
            return result

    try:
        with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
            results = []
            futures = {executor.submit(process_file, arg): arg[0] for arg in input_args}
            with ProgressSavingBar(
                total=len(image_files),
                desc=f"{colorama.Fore.BLUE}Checking image files{colorama.Style.RESET_ALL}",
                unit="file",
                bar_format="{desc}: {percentage:3.0f}%|{bar:30}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
                colour="blue"
            ) as pbar:
                for future in concurrent.futures.as_completed(futures):
                    file_path = futures[future]
                    try:
                        result = future.result()
                        # Unpack here so a malformed result is logged now
                        # instead of breaking the summary loop later.
                        result_path, is_valid, _, repair_status, _, _ = result
                    except Exception as e:
                        logging.error(f"Error processing {file_path}: {str(e)}")
                    else:
                        results.append(result)
                        # Track this file as processed for resuming later
                        processed_files.append(file_path)
                        if repair_status == "repaired":
                            repaired_files.append(result_path)
                        elif not is_valid:
                            bad_files.append(result_path)
                    # Advance the bar last: update() may save progress, and the
                    # saved state must already classify the file just handled.
                    pbar.update(1)
    except KeyboardInterrupt:
        # If the user interrupts, save progress before exiting
        logging.warning("Process interrupted by user. Saving progress...")
        save_progress(session_id, directory, formats, recursive,
                      processed_files, bad_files, repaired_files, progress_dir)
        logging.info(f"Progress saved. You can resume with --resume {session_id}")
        raise

    # Process results
    total_size_saved = 0
    for file_path, is_valid, size, repair_status, repair_msg, dimensions in results:
        if repair_status == "repaired":
            # Already recorded in repaired_files while the pool was running
            width, height = dimensions
            msg = f"Repaired: {file_path} ({width}x{height}) - {repair_msg}"
            logging.info(msg)
        elif not is_valid:
            # Corrupt and not repaired; already recorded in bad_files
            total_size_saved += size
            size_str = humanize.naturalsize(size)

            if repair_status == "repair_failed":
                fail_msg = f"Repair failed: {file_path} ({size_str}) - {repair_msg}"
                logging.warning(fail_msg)

            if dry_run:
                msg = f"Would delete: {file_path} ({size_str})"
                logging.info(msg)
            elif move_to:
                # Preserve the subdirectory structure relative to the scan root
                try:
                    rel_path = os.path.relpath(file_path, str(directory))
                    # Only a leading ".." path component means the file lies
                    # outside the scan root; a name that merely starts with
                    # two dots does not. Fall back to the bare file name.
                    if rel_path == os.pardir or rel_path.startswith(os.pardir + os.sep):
                        rel_path = os.path.basename(file_path)

                    # safe_join_path raises ValueError for a destination it
                    # rejects; such files are skipped.
                    try:
                        dest_path = safe_join_path(move_to, rel_path)
                    except ValueError as ve:
                        logging.error(f"Security error moving {file_path}: {ve}")
                        continue

                    # Create parent directories if they don't exist
                    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
                    # shutil.move (unlike os.rename) handles cross-device moves
                    shutil.move(file_path, dest_path)

                    arrow = f"{colorama.Fore.CYAN}→{colorama.Style.RESET_ALL}"
                    msg = f"Moved: {file_path} {arrow} {dest_path} ({size_str})"
                    logging.info(msg)
                except Exception as e:
                    logging.error(f"Failed to move {file_path}: {e}")
            else:
                try:
                    os.remove(file_path)
                    msg = f"Deleted: {file_path} ({size_str})"
                    logging.info(msg)
                except Exception as e:
                    logging.error(f"Failed to delete {file_path}: {e}")

    # Final progress save
    save_progress(session_id, directory, formats, recursive,
                  processed_files, bad_files, repaired_files, progress_dir)

    elapsed = time.time() - start_time
    logging.info(f"Processed {len(processed_files)} files in {elapsed:.2f} seconds")
    logging.info(f"Session ID: {session_id} (use --resume {session_id} to resume if needed)")
    return bad_files, repaired_files, total_size_saved
def print_banner():
    """Print the 2PAC ASCII-art banner, followed by an empty line.

    When the ``colorama`` module is loaded, the first seven rows (the logo)
    are split at columns 11, 24 and 38 and painted white/red/green/blue, and
    rows 7-10 (the box) are painted yellow, with the title words coloured
    individually and the tribute row in cyan. Otherwise the banner is printed
    unchanged.
    """
    banner = r"""
░▒▓███████▓▒░░▒▓███████▓▒░ ░▒▓██████▓▒░ ░▒▓██████▓▒░
░▒▓█▓▒░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░░▒▓█▓▒░
░▒▓█▓▒░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░
░▒▓██████▓▒░░▒▓███████▓▒░░▒▓████████▓▒░▒▓█▓▒░
░▒▓█▓▒░ ░▒▓█▓▒░ ░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░
░▒▓█▓▒░ ░▒▓█▓▒░ ░▒▓█▓▒░░▒▓█▓▒░▒▓█▓▒░░▒▓█▓▒░
░▒▓████████▓▒░▒▓█▓▒░ ░▒▓█▓▒░░▒▓█▓▒░░▒▓██████▓▒░
╔═════════════════════════════════════════════════════════╗
║ The Picture Analyzer & Corruption killer ║
║ In memory of Jeff Young - Bringing people together ║
╚═════════════════════════════════════════════════════════╝
"""
    # Plain output when colorama has not been imported
    if 'colorama' not in sys.modules:
        print(banner)
        print()
        return

    fore = colorama.Fore
    reset = colorama.Style.RESET_ALL

    def tint(colour, text):
        # Wrap a whole row in one colour and reset afterwards
        return f"{colour}{text}{reset}"

    painted_rows = []
    for index, row in enumerate(banner.strip().split('\n')):
        if index < 7:
            # Logo rows: one colour per column ("2", "P", "A", "C")
            painted = (
                f"{fore.WHITE}{row[:11]}"
                f"{fore.RED}{row[11:24]}"
                f"{fore.GREEN}{row[24:38]}"
                f"{fore.BLUE}{row[38:]}{reset}"
            )
        elif index > 10:
            painted = tint(fore.WHITE, row)
        elif index == 9:
            # Jeff Young tribute row
            painted = tint(fore.CYAN, row)
        else:
            # Row 8 is the title: colour its words separately when the
            # phrase occurs exactly once; everything else is box yellow.
            pieces = row.split("Picture Analyzer & Corruption") if index == 8 else ()
            if len(pieces) == 2:
                lead, tail = pieces
                painted = (
                    f"{fore.YELLOW}{lead}"
                    f"{fore.RED}Picture "
                    f"{fore.GREEN}Analyzer "
                    f"{fore.WHITE}& "
                    f"{fore.BLUE}Corruption"
                    f"{fore.YELLOW}{tail}{reset}"
                )
            else:
                painted = tint(fore.YELLOW, row)
        painted_rows.append(painted)

    print('\n'.join(painted_rows))
    print()
def _build_cli_parser():
    """Create the argparse parser describing every 2PAC command-line option."""
    parser = argparse.ArgumentParser(
        description='2PAC: The Picture Analyzer & Corruption killer',
        epilog='Created by Richard Young - "All Eyez On Your Images" - https://github.com/ricyoung/2pac'
    )

    # Main action (mutually exclusive)
    action_group = parser.add_mutually_exclusive_group()
    action_group.add_argument('directory', nargs='?', help='Directory to search for image files')
    action_group.add_argument('--list-sessions', action='store_true', help='List all saved sessions')
    action_group.add_argument('--check-file', type=str, help='Check a specific file for corruption (useful for testing)')

    # Basic options
    parser.add_argument('--delete', action='store_true', help='Delete corrupt image files (without this flag, runs in dry-run mode)')
    parser.add_argument('--move-to', type=str, help='Move corrupt files to this directory instead of deleting them')
    parser.add_argument('--workers', type=int, default=None, help='Number of worker processes (default: CPU count)')
    parser.add_argument('--non-recursive', action='store_true', help='Only search in the specified directory, not subdirectories')
    parser.add_argument('--output', type=str, help='Save list of corrupt files to this file')
    parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose logging')
    parser.add_argument('--no-color', action='store_true', help='Disable colored output')
    parser.add_argument('--version', action='version', version=f'Bad Image Finder v{VERSION} by Richard Young')

    # Repair options
    repair_group = parser.add_argument_group('Repair options')
    repair_group.add_argument('--repair', action='store_true', help='Attempt to repair corrupt image files')
    repair_group.add_argument('--backup-dir', type=str, help='Directory to store backups of files before repair')
    repair_group.add_argument('--repair-report', type=str, help='Save list of repaired files to this file')

    # Format options
    format_group = parser.add_argument_group('Image format options')
    format_group.add_argument('--formats', type=str, nargs='+', choices=SUPPORTED_FORMATS.keys(),
                              help='Image formats to check (default: all formats)')
    format_group.add_argument('--jpeg', action='store_true', help='Check JPEG files only')
    format_group.add_argument('--png', action='store_true', help='Check PNG files only')
    format_group.add_argument('--tiff', action='store_true', help='Check TIFF files only')
    format_group.add_argument('--gif', action='store_true', help='Check GIF files only')
    format_group.add_argument('--bmp', action='store_true', help='Check BMP files only')

    # Validation options
    validation_group = parser.add_argument_group('Validation options')
    validation_group.add_argument('--thorough', action='store_true',
                                  help='Perform thorough image validation (slower but catches more subtle corruption)')
    validation_group.add_argument('--sensitivity', type=str, choices=['low', 'medium', 'high'], default='medium',
                                  help='Set validation sensitivity level: low (basic checks), medium (standard checks), high (most strict)')
    validation_group.add_argument('--ignore-eof', action='store_true',
                                  help='Ignore missing end-of-file markers (useful for truncated but viewable files)')
    validation_group.add_argument('--check-visual', action='store_true',
                                  help='Analyze image content to detect visible corruption like gray/black areas')
    validation_group.add_argument('--visual-strictness', type=str, choices=['low', 'medium', 'high'], default='medium',
                                  help='Set strictness level for visual corruption detection: low (most permissive), medium (balanced), high (only clear corruption)')

    # Security options
    # NOTE(review): --max-file-size and --max-pixels are parsed but their
    # values are never read by main(); confirm whether they should be
    # forwarded to the checks.
    security_group = parser.add_argument_group('Security options')
    security_group.add_argument('--security-checks', action='store_true',
                                help='Enable enhanced security validation (file size limits, dimension checks, format verification)')
    security_group.add_argument('--max-file-size', type=int, default=MAX_FILE_SIZE,
                                help=f'Maximum file size in bytes to process (default: {MAX_FILE_SIZE} = 100MB)')
    security_group.add_argument('--max-pixels', type=int, default=MAX_IMAGE_PIXELS,
                                help=f'Maximum image dimensions in pixels (default: {MAX_IMAGE_PIXELS} = 50MP)')

    # Progress saving options
    progress_group = parser.add_argument_group('Progress options')
    progress_group.add_argument('--save-interval', type=int, default=5,
                                help='Save progress every N minutes (0 to disable progress saving)')
    progress_group.add_argument('--progress-dir', type=str, default=DEFAULT_PROGRESS_DIR,
                                help='Directory to store progress files')
    progress_group.add_argument('--resume', type=str, metavar='SESSION_ID',
                                help='Resume from a previously saved session')
    return parser


def _report_single_file(file_path):
    """Run every available check on one file and print a report (--check-file)."""
    print(f"\n{colorama.Style.BRIGHT}Checking file: {file_path}{colorama.Style.RESET_ALL}\n")

    # Basic check
    print(f"{colorama.Fore.CYAN}Basic validation:{colorama.Style.RESET_ALL}")
    try:
        with Image.open(file_path) as img:
            print(f"✓ File can be opened by PIL")
            print(f" Format: {img.format}")
            print(f" Mode: {img.mode}")
            print(f" Size: {img.size[0]}x{img.size[1]}")

            try:
                img.verify()
                print(f"✓ Header verification passed")
            except Exception as e:
                print(f"❌ Header verification failed: {str(e)}")

            # The image is opened a second time for the load test
            try:
                with Image.open(file_path) as img2:
                    img2.load()
                print(f"✓ Data loading test passed")
            except Exception as e:
                print(f"❌ Data loading test failed: {str(e)}")
    except Exception as e:
        print(f"❌ Cannot open file with PIL: {str(e)}")

    # Detailed format-specific checks
    if file_path.lower().endswith(tuple(SUPPORTED_FORMATS['JPEG'])):
        print(f"\n{colorama.Fore.CYAN}JPEG structure checks:{colorama.Style.RESET_ALL}")
        is_valid, msg = check_jpeg_structure(file_path)
        if is_valid:
            print(f"✓ JPEG structure valid: {msg}")
        else:
            print(f"❌ JPEG structure invalid: {msg}")
    elif file_path.lower().endswith(tuple(SUPPORTED_FORMATS['PNG'])):
        print(f"\n{colorama.Fore.CYAN}PNG structure checks:{colorama.Style.RESET_ALL}")
        is_valid, msg = check_png_structure(file_path)
        if is_valid:
            print(f"✓ PNG structure valid: {msg}")
        else:
            print(f"❌ PNG structure invalid: {msg}")

    # Decode test
    print(f"\n{colorama.Fore.CYAN}Full decode test:{colorama.Style.RESET_ALL}")
    is_valid, msg = try_full_decode_check(file_path)
    if is_valid:
        print(f"✓ Full decode test passed: {msg}")
    else:
        print(f"❌ Full decode test failed: {msg}")

    # External tools check
    print(f"\n{colorama.Fore.CYAN}External tools check:{colorama.Style.RESET_ALL}")
    is_valid, msg = try_external_tools(file_path)
    if is_valid:
        print(f"✓ External tools: {msg}")
    else:
        print(f"❌ External tools: {msg}")

    # Visual corruption check
    print(f"\n{colorama.Fore.CYAN}Visual content analysis:{colorama.Style.RESET_ALL}")
    is_visually_corrupt, vis_msg = check_visual_corruption(file_path)
    if not is_visually_corrupt:
        print(f"✓ No visual corruption detected: {vis_msg}")
    else:
        print(f"❌ {vis_msg}")

    # Final verdict
    print(f"\n{colorama.Fore.CYAN}Final verdict:{colorama.Style.RESET_ALL}")
    is_valid_basic = is_valid_image(file_path, thorough=False)
    is_valid_thorough = is_valid_image(file_path, thorough=True)
    is_valid_visual = not is_visually_corrupt

    if is_valid_basic and is_valid_thorough and is_valid_visual:
        print(f"{colorama.Fore.GREEN}This file appears to be valid by all checks.{colorama.Style.RESET_ALL}")
    elif not is_valid_visual:
        print(f"{colorama.Fore.RED}This file shows visible corruption in the image content.{colorama.Style.RESET_ALL}")
        print(f"Recommendation: Use --check-visual to detect this type of corruption.")
    elif is_valid_basic and not is_valid_thorough:
        print(f"{colorama.Fore.YELLOW}This file passes basic validation but fails thorough checks.{colorama.Style.RESET_ALL}")
        print(f"Recommendation: Use --thorough mode to detect this type of corruption.")
    else:
        print(f"{colorama.Fore.RED}This file is corrupt and would be detected by the basic scan.{colorama.Style.RESET_ALL}")


def _print_saved_sessions(progress_dir):
    """Print a summary of every saved session in *progress_dir* (--list-sessions)."""
    sessions = list_saved_sessions(progress_dir)
    if not sessions:
        print("No saved sessions found.")
        return

    print(f"\n{colorama.Style.BRIGHT}Saved Sessions:{colorama.Style.RESET_ALL}")
    for session in sessions:
        raw_timestamp = session['timestamp']
        # A session may carry a placeholder such as 'Unknown' instead of an
        # ISO timestamp; show it verbatim rather than aborting the listing.
        try:
            ts = datetime.fromisoformat(raw_timestamp).strftime('%Y-%m-%d %H:%M:%S')
        except (TypeError, ValueError):
            ts = str(raw_timestamp)

        print(f"\n{colorama.Fore.CYAN}Session ID: {session['id']}{colorama.Style.RESET_ALL}")
        print(f" Created: {ts}")
        print(f" Directory: {session['directory']}")
        print(f" Formats: {', '.join(session['formats'])}")
        print(f" Progress: {session['processed_count']} files processed, "
              f"{session['bad_count']} corrupt, {session['repaired_count']} repaired")

        # Show resume command
        resume_cmd = f"find_bad_images.py --resume {session['id']}"
        if os.path.exists(session['directory']):
            print(f" {colorama.Fore.GREEN}Resume command: {resume_cmd}{colorama.Style.RESET_ALL}")
        else:
            print(f" {colorama.Fore.YELLOW}Directory no longer exists, cannot resume{colorama.Style.RESET_ALL}")


def _resolve_formats(args, saved_formats=None):
    """Pick the formats to scan.

    Precedence: ``--formats``; then every per-format flag that was given
    (they combine); then the formats of the session being resumed; then all
    default formats.
    """
    if args.formats:
        return args.formats

    flag_to_format = (
        ('jpeg', 'JPEG'),
        ('png', 'PNG'),
        ('tiff', 'TIFF'),
        ('gif', 'GIF'),
        ('bmp', 'BMP'),
    )
    flagged = [name for flag, name in flag_to_format if getattr(args, flag)]
    if flagged:
        return flagged

    # process_images only resumes when the formats equal the saved ones, so
    # a bare "--resume ID" has to reuse them.
    if saved_formats and isinstance(saved_formats, (list, tuple)):
        return list(saved_formats)

    # Default: check all formats (copied so the module list is never aliased)
    return list(DEFAULT_FORMATS)


def _log_scan_configuration(args, formats, dry_run, directory):
    """Log the mode banners describing how the upcoming scan is configured."""
    if args.repair:
        mode_str = f"{colorama.Fore.MAGENTA}REPAIR MODE{colorama.Style.RESET_ALL}: Attempting to fix corrupt files"
        logging.info(mode_str)
        repairable_formats = [fmt for fmt in formats if fmt in REPAIRABLE_FORMATS]
        if repairable_formats:
            logging.info(f"Repairable formats: {', '.join(repairable_formats)}")
        else:
            logging.warning("None of the selected formats support repair")

    if dry_run:
        mode_str = f"{colorama.Fore.YELLOW}DRY RUN MODE{colorama.Style.RESET_ALL}: No files will be deleted or moved"
        logging.info(mode_str)
    elif args.move_to:
        mode_str = f"{colorama.Fore.BLUE}MOVE MODE{colorama.Style.RESET_ALL}: Corrupt files will be moved to {args.move_to}"
        logging.info(mode_str)
    else:
        mode_str = f"{colorama.Fore.RED}DELETE MODE{colorama.Style.RESET_ALL}: Corrupt files will be permanently deleted"
        logging.info(mode_str)

    # Add progress saving info
    if args.save_interval > 0:
        save_interval_str = f"{colorama.Fore.CYAN}PROGRESS SAVING{colorama.Style.RESET_ALL}: Every {args.save_interval} minutes"
        logging.info(save_interval_str)
    else:
        logging.info("Progress saving is disabled")

    if args.resume:
        resume_str = f"{colorama.Fore.CYAN}RESUMING{colorama.Style.RESET_ALL}: From session {args.resume}"
        logging.info(resume_str)

    if args.thorough:
        thorough_str = f"{colorama.Fore.MAGENTA}THOROUGH MODE{colorama.Style.RESET_ALL}: Using deep validation checks (slower but more accurate)"
        logging.info(thorough_str)

    level_colors = {
        'low': colorama.Fore.GREEN,
        'medium': colorama.Fore.YELLOW,
        'high': colorama.Fore.RED
    }

    # Show sensitivity level
    sensitivity_color = level_colors.get(args.sensitivity, colorama.Fore.YELLOW)
    sensitivity_str = f"{sensitivity_color}SENSITIVITY: {args.sensitivity.upper()}{colorama.Style.RESET_ALL}"
    logging.info(sensitivity_str)

    # Show EOF handling
    if args.ignore_eof:
        eof_str = f"{colorama.Fore.CYAN}IGNORING EOF MARKERS{colorama.Style.RESET_ALL}: Allowing truncated but viewable files"
        logging.info(eof_str)

    # Show visual corruption checking status
    if args.check_visual:
        strictness_color = level_colors.get(args.visual_strictness, colorama.Fore.YELLOW)
        visual_str = f"{colorama.Fore.MAGENTA}VISUAL CHECK{colorama.Style.RESET_ALL}: " + \
                     f"Analyzing image content (strictness: {strictness_color}{args.visual_strictness.upper()}{colorama.Style.RESET_ALL})"
        logging.info(visual_str)

    # Show security checks status. The limits shown are the module constants;
    # --max-file-size / --max-pixels are not consulted here.
    if args.security_checks:
        security_str = f"{colorama.Fore.RED}SECURITY CHECKS ENABLED{colorama.Style.RESET_ALL}: " + \
                       f"Validating file sizes (max {humanize.naturalsize(MAX_FILE_SIZE)}), " + \
                       f"dimensions (max {MAX_IMAGE_PIXELS:,} pixels), and format integrity"
        logging.info(security_str)

    # Show which formats we're checking
    format_list = ", ".join(formats)
    logging.info(f"Checking image formats: {format_list}")
    logging.info(f"Searching for corrupt image files in {directory}")


def main():
    """Command-line entry point.

    Prints the banner, parses the arguments and runs exactly one of three
    modes: single-file report (``--check-file``), session listing
    (``--list-sessions``) or a directory scan via ``process_images``.

    Exits with status 0 after the two reporting modes, 1 on invalid input or
    an unexpected error, and 130 when the scan is interrupted by the user.
    """
    print_banner()

    # Check for 'q' command to quit
    if len(sys.argv) == 2 and sys.argv[1].lower() == 'q':
        print(f"{colorama.Fore.YELLOW}Exiting 2PAC. Stay safe!{colorama.Style.RESET_ALL}")
        sys.exit(0)

    args = _build_cli_parser().parse_args()

    # Setup logging
    setup_logging(args.verbose, args.no_color)

    # Handle specific file check mode
    if args.check_file:
        if not os.path.exists(args.check_file):
            logging.error(f"Error: File not found: {args.check_file}")
            sys.exit(1)
        _report_single_file(args.check_file)
        sys.exit(0)

    # Handle session listing mode
    if args.list_sessions:
        _print_saved_sessions(args.progress_dir)
        sys.exit(0)

    # Check if directory is specified for a new scan
    if not args.directory and not args.resume:
        logging.error("Error: You must specify a directory to scan or use --resume to continue a session")
        sys.exit(1)

    # If we're resuming without a directory, load it from the saved session
    directory = None
    saved_formats = None
    if args.resume and not args.directory:
        progress = load_progress(args.resume, args.progress_dir)
        if progress:
            directory = Path(progress['directory'])
            logging.info(f"Using directory from saved session: {directory}")
            try:
                saved_formats = progress['formats']
            except (KeyError, TypeError):
                saved_formats = None
        else:
            logging.error(f"Could not load session {args.resume}")
            sys.exit(1)
    elif args.directory:
        directory = Path(args.directory)

    # Verify the directory exists
    if not directory.exists() or not directory.is_dir():
        logging.error(f"Error: {directory} is not a valid directory")
        sys.exit(1)

    # Check for incompatible options
    if args.delete and args.move_to:
        logging.error("Error: Cannot use both --delete and --move-to options")
        sys.exit(1)

    formats = _resolve_formats(args, saved_formats)
    dry_run = not (args.delete or args.move_to)

    _log_scan_configuration(args, formats, dry_run, directory)

    try:
        bad_files, repaired_files, total_size_saved = process_images(
            directory,
            formats,
            dry_run=dry_run,
            repair=args.repair,
            max_workers=args.workers,
            recursive=not args.non_recursive,
            move_to=args.move_to,
            repair_dir=args.backup_dir,
            save_progress_interval=args.save_interval,
            resume_session=args.resume,
            progress_dir=args.progress_dir,
            thorough_check=args.thorough,
            sensitivity=args.sensitivity,
            ignore_eof=args.ignore_eof,
            check_visual=args.check_visual,
            visual_strictness=args.visual_strictness,
            enable_security_checks=args.security_checks
        )

        # Colorful summary
        count_color = colorama.Fore.RED if bad_files else colorama.Fore.GREEN
        file_count = f"{count_color}{len(bad_files)}{colorama.Style.RESET_ALL}"
        logging.info(f"Found {file_count} corrupt image files")

        if args.repair:
            repair_color = colorama.Fore.GREEN if repaired_files else colorama.Fore.YELLOW
            repair_count = f"{repair_color}{len(repaired_files)}{colorama.Style.RESET_ALL}"
            logging.info(f"Successfully repaired {repair_count} files")

            if args.repair_report and repaired_files:
                with open(args.repair_report, 'w') as f:
                    for file_path in repaired_files:
                        f.write(f"{file_path}\n")
                logging.info(f"Saved list of repaired files to {args.repair_report}")

        savings_str = humanize.naturalsize(total_size_saved)
        savings_color = colorama.Fore.GREEN if total_size_saved > 0 else colorama.Fore.RESET
        savings_msg = f"Total space savings: {savings_color}{savings_str}{colorama.Style.RESET_ALL}"
        logging.info(savings_msg)

        if not args.no_color:
            # Add signature at the end of the run
            signature = f"\n{colorama.Fore.CYAN}2PAC v{VERSION} by Richard Young{colorama.Style.RESET_ALL}"
            quote = f"{colorama.Fore.YELLOW}\"{random.choice(QUOTES)}\"{colorama.Style.RESET_ALL}"
            print(signature)
            print(quote)

        # Save list of corrupt files if requested
        if args.output and bad_files:
            with open(args.output, 'w') as f:
                for file_path in bad_files:
                    f.write(f"{file_path}\n")
            logging.info(f"Saved list of corrupt files to {args.output}")

        if bad_files and dry_run:
            logging.info("Run with --delete to remove these files or --move-to to relocate them")

    except KeyboardInterrupt:
        logging.info("Operation cancelled by user")
        sys.exit(130)
    except Exception as e:
        logging.error(f"Error: {str(e)}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()