import gradio as gr
import numpy as np
import cv2
import torch
import pathlib
import sys
import json
from PIL import Image
from PIL.ExifTags import TAGS
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from typing import Dict, List, Tuple, Optional
import warnings

warnings.filterwarnings('ignore')

# Add the agent module to path
ROOT = pathlib.Path(__file__).resolve().parent
sys.path.insert(0, str(ROOT / "goal2" / "src"))

from agent import models, geometry, io

# Device configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Camera presets for common devices
CAMERA_PRESETS = {
    "iPhone 12/13/14 (Main Camera)": {"fx": 1840, "fy": 1840, "description": "26mm equivalent, f/1.6"},
    "iPhone 12/13/14 (Ultra Wide)": {"fx": 920, "fy": 920, "description": "13mm equivalent, f/2.4"},
    "Samsung Galaxy S21/S22": {"fx": 1950, "fy": 1950, "description": "26mm equivalent"},
    "Google Pixel 6/7": {"fx": 1800, "fy": 1800, "description": "27mm equivalent"},
    "Generic Smartphone": {"fx": 1500, "fy": 1500, "description": "Typical smartphone camera"},
    "Custom": {"fx": 1500, "fy": 1500, "description": "Enter your own focal length values"},
}
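
# A note on units (added sketch, not part of the original pipeline): the fx/fy
# values above are focal lengths expressed in pixels. If you know a camera's
# physical focal length and sensor width, the pinhole model gives the
# conversion fx = f_mm / sensor_width_mm * image_width_px. The helper below
# illustrates this; the function name and signature are illustrative
# assumptions, not part of the app's API.
def focal_mm_to_pixels(f_mm: float, sensor_width_mm: float, image_width_px: int) -> float:
    """Convert a physical focal length (mm) to pixel units for a given image width."""
    return f_mm / sensor_width_mm * image_width_px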

class SizeEstimatorApp:
    def __init__(self):
        self.depth_net = None
        self.mask_gen = None
        self.current_image = None
        self.current_depth = None
        self.current_masks = None
        self.reference_object = None

    def detect_camera_from_exif(self, image_pil: Image.Image) -> Tuple[str, Dict]:
        """Try to detect the camera type from EXIF data."""
        try:
            exif = image_pil.getexif()  # public API; _getexif() is JPEG-only
            if not exif:
                return "Unknown", {}
            # Map numeric EXIF tag ids to their human-readable names
            exif_data = {}
            for tag_id, value in exif.items():
                tag = TAGS.get(tag_id, tag_id)
                exif_data[tag] = value
            # Try to identify the camera make/model (values may be bytes, so
            # coerce to str before lowercasing)
            make = str(exif_data.get('Make', '')).lower()
            model = str(exif_data.get('Model', '')).lower()
            # Match against known camera presets
            if 'apple' in make or 'iphone' in model:
                if any(x in model for x in ['12', '13', '14']):
                    return "iPhone 12/13/14 (Main Camera)", exif_data
                else:
                    return "Generic Smartphone", exif_data
            elif 'samsung' in make:
                return "Samsung Galaxy S21/S22", exif_data
            elif 'google' in make or 'pixel' in model:
                return "Google Pixel 6/7", exif_data
            else:
                return "Generic Smartphone", exif_data
        except Exception as e:
            print(f"EXIF detection failed: {e}")
            return "Unknown", {}

    def load_models(self):
        """Load the depth and segmentation models."""
        if self.depth_net is None:
            print("Loading Depth Anything V2...")
            self.depth_net = models.load_depth(DEVICE)
        if self.mask_gen is None:
            print("Loading SAM...")
            self.mask_gen = models.load_sam(DEVICE)
        return "✅ Models loaded successfully!"

    def process_image(self, image: np.ndarray, camera_preset: str, fx_custom: float, fy_custom: float) -> Tuple[Optional[np.ndarray], str]:
        """Process the uploaded image and generate depth + segmentation."""
        try:
            # Input validation
            if image is None:
                return None, "❌ No image provided. Please upload an image."
            if len(image.shape) != 3 or image.shape[2] != 3:
                return None, "❌ Invalid image format. Please upload a color image (RGB)."
            # Check image size constraints
            h, w = image.shape[:2]
            if h < 100 or w < 100:
                return None, "❌ Image too small. Please upload an image at least 100×100 pixels."
            if h > 4000 or w > 4000:
                status_msg = "⚠️ Large image detected. Resizing for processing...\n"
                # Downscale very large images before running the models
                max_size = 2000
                scale = min(max_size / w, max_size / h)
                if scale < 1:
                    new_w, new_h = int(w * scale), int(h * scale)
                    image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)
                    status_msg += f"📐 Resized from {w}×{h} to {new_w}×{new_h}\n"
            else:
                status_msg = ""
            # Ensure models are loaded
            if self.depth_net is None or self.mask_gen is None:
                self.load_models()
            # Store the original image
            self.current_image = image.copy()
            # Validate camera parameters
            if camera_preset == "Custom":
                if fx_custom <= 0 or fy_custom <= 0:
                    return None, "❌ Invalid focal length values. Must be greater than 0."
                if fx_custom < 100 or fy_custom < 100 or fx_custom > 5000 or fy_custom > 5000:
                    return None, "❌ Focal length values seem unrealistic. Typical range: 100-5000 pixels."
                fx, fy = fx_custom, fy_custom
            else:
                preset = CAMERA_PRESETS[camera_preset]
                fx, fy = preset["fx"], preset["fy"]
            # Generate depth and masks using the robust approach
            depth, masks, processed_img = models.predict_depth_and_masks(
                self.depth_net, self.mask_gen, image, DEVICE, approach="aligned"
            )
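            # (Assumed behaviour of the helper, inferred from the loader
            # messages above: Depth Anything V2 yields a dense relative-depth
            # map, SAM's automatic mask generator proposes object masks, and
            # the "aligned" approach presumably returns both at the resolution
            # of processed_img.)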
            # Validate results
            if depth is None or len(depth.shape) != 2:
                return None, "❌ Failed to generate depth map. Please try a different image."
            if not masks or len(masks) == 0:
                return None, "❌ No objects detected in the image. Try an image with clearer objects."
            # Filter out very small masks (likely noise)
            min_area = (image.shape[0] * image.shape[1]) * 0.001  # 0.1% of image area
            filtered_masks = [m for m in masks if m['area'] > min_area]
            if len(filtered_masks) == 0:
                return None, "❌ No significant objects detected. Try an image with larger, clearer objects."
            self.current_depth = depth
            self.current_masks = filtered_masks
            # Create visualization
            vis_image = self.create_mask_visualization(processed_img, filtered_masks)
            status = status_msg + f"✅ Processed successfully! Found {len(filtered_masks)} objects.\n"
            status += f"📷 Camera: {camera_preset} (fx={fx:.0f}, fy={fy:.0f})\n"
            status += f"🖼️ Image size: {image.shape[1]}×{image.shape[0]}\n"
            if len(masks) > len(filtered_masks):
                status += f"🔍 Filtered out {len(masks) - len(filtered_masks)} small objects\n"
            status += "📏 Ready for size estimation - select object number and known size below"
            return vis_image, status
        except Exception as e:
            import traceback
            error_details = traceback.format_exc()
            print("Full error:", error_details)  # For debugging
            return None, f"❌ Error processing image: {str(e)}\nPlease try a different image."

    def create_mask_visualization(self, image: np.ndarray, masks: List[Dict]) -> np.ndarray:
        """Create a visualization with colored masks and numeric labels."""
        vis_img = image.copy()
        # Sort masks by area (largest first)
        sorted_masks = sorted(masks, key=lambda x: x['area'], reverse=True)
        # Assign each mask a distinct color
        colors = plt.cm.Set3(np.linspace(0, 1, len(sorted_masks)))
        for i, mask_data in enumerate(sorted_masks):
            mask = mask_data['segmentation']
            color = colors[i][:3]  # RGB values
            # Apply a colored overlay only inside the mask. Blending against a
            # copy keeps unmasked pixels unchanged; blending against a zeroed
            # image would darken the whole frame on every iteration.
            overlay = vis_img.copy()
            overlay[mask] = [int(c * 255) for c in color]
            vis_img = cv2.addWeighted(vis_img, 0.7, overlay, 0.3, 0)
            # Add a number label at the mask centroid (white outline, black fill)
            y, x = np.where(mask)
            if len(x) > 0 and len(y) > 0:
                center_x, center_y = int(np.mean(x)), int(np.mean(y))
                cv2.putText(vis_img, str(i + 1), (center_x - 10, center_y + 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
                cv2.putText(vis_img, str(i + 1), (center_x - 10, center_y + 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 0), 1)
        return vis_img

    def select_reference_object(self, mask_number: int, reference_size_cm: float, dimension: str) -> str:
        """Select a mask as the reference object and record its known size."""
        try:
            if self.current_masks is None:
                return "❌ No image processed yet. Please upload and process an image first."
            if mask_number < 1 or mask_number > len(self.current_masks):
                return f"❌ Invalid mask number. Choose between 1 and {len(self.current_masks)}"
            if reference_size_cm <= 0:
                return "❌ Reference size must be greater than 0"
            # Get the selected mask (convert to 0-based index, using the same
            # area ordering as the visualization)
            sorted_masks = sorted(self.current_masks, key=lambda x: x['area'], reverse=True)
            selected_mask = sorted_masks[mask_number - 1]
            # Store reference object info
            self.reference_object = {
                'mask_data': selected_mask,
                'known_size_cm': reference_size_cm,
                'dimension': dimension,  # 'width' or 'height'
            }
            return f"✅ Reference object #{mask_number} selected!\n📏 Known {dimension}: {reference_size_cm} cm"
        except Exception as e:
            return f"❌ Error selecting reference: {str(e)}"

    def calculate_all_sizes(self, camera_preset: str, fx_custom: float, fy_custom: float) -> str:
        """Calculate the sizes of all objects, using the reference object for scale."""
        try:
            if self.current_masks is None:
                return "❌ No image processed yet."
            if self.reference_object is None:
                return "❌ No reference object selected. Please select a reference object first."
            # Get camera parameters
            if camera_preset == "Custom":
                fx, fy = fx_custom, fy_custom
            else:
                preset = CAMERA_PRESETS[camera_preset]
                fx, fy = preset["fx"], preset["fy"]
            # Measure the reference object first (metric estimate from the
            # depth map and camera intrinsics)
            ref_mask = self.reference_object['mask_data']['segmentation']
            ref_stats = geometry.pixel_to_metric(ref_mask, self.current_depth, fx, fy)
            # Get the reference object's measured dimension in cm
            if self.reference_object['dimension'] == 'width':
                ref_measured_cm = ref_stats['width_m'] * 100  # Convert to cm
            else:  # height
                ref_measured_cm = ref_stats['height_m'] * 100  # Convert to cm
            # Scale factor: known size / measured size
            scale_factor = self.reference_object['known_size_cm'] / ref_measured_cm
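            # Why a reference object is needed (explanatory note): monocular
            # depth networks typically predict depth only up to an unknown
            # global scale, so the raw width_m/height_m estimates are in
            # arbitrary units. Pinning one known dimension recovers the metric
            # scale for every object in the scene:
            #   true_size = measured_size * (known_size_cm / measured_size_cm)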
            # Calculate sizes for all objects
            results = []
            sorted_masks = sorted(self.current_masks, key=lambda x: x['area'], reverse=True)
            for i, mask_data in enumerate(sorted_masks):
                mask = mask_data['segmentation']
                stats = geometry.pixel_to_metric(mask, self.current_depth, fx, fy)
                # Apply the scale correction
                corrected_width = stats['width_m'] * 100 * scale_factor  # cm
                corrected_height = stats['height_m'] * 100 * scale_factor  # cm
                corrected_distance = stats['distance_m'] * scale_factor  # meters
                # Check whether this is the reference object by comparing mask data
                is_reference = np.array_equal(mask_data['segmentation'], self.reference_object['mask_data']['segmentation'])
                ref_marker = " (REFERENCE)" if is_reference else ""
                results.append(f"Object #{i + 1}{ref_marker}:")
                results.append(f"  📏 Width: {corrected_width:.1f} cm")
                results.append(f"  📐 Height: {corrected_height:.1f} cm")
                results.append(f"  📍 Distance: {corrected_distance:.2f} m")
                results.append(f"  📊 Area: {mask_data['area']} pixels")
                results.append("")
            # Find the reference object's number for display
            ref_object_num = None
            for i, mask_data in enumerate(sorted_masks):
                if np.array_equal(mask_data['segmentation'], self.reference_object['mask_data']['segmentation']):
                    ref_object_num = i + 1
                    break
            # Add calibration info
            results.append("=" * 40)
            results.append("📊 Calibration Info:")
            results.append(f"📷 Camera: {camera_preset}")
            results.append(f"📐 Scale factor: {scale_factor:.3f}")
            results.append(f"📌 Reference: Object #{ref_object_num if ref_object_num else 'Unknown'}")
            results.append(f"📏 Known {self.reference_object['dimension']}: {self.reference_object['known_size_cm']} cm")
            return "\n".join(results)
        except Exception as e:
            return f"❌ Error calculating sizes: {str(e)}"

# Initialize the app
app = SizeEstimatorApp()

# Gradio interface
def create_interface():
    with gr.Blocks(title="📏 Smart Object Size Estimator", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 📏 Smart Object Size Estimator

        Upload an image and get real-world size measurements of objects using AI-powered depth estimation and segmentation.

        ## How to use:
        1. **Upload an image** and select your camera type
        2. **Click Process** to detect objects
        3. **Select a reference object** by entering its number and its known size
        4. **Calculate sizes** to get measurements of all objects
        """)

        with gr.Row():
            with gr.Column(scale=1):
                # Input section
                gr.Markdown("### 📤 Input")
                image_input = gr.Image(type="numpy", label="Upload Image")

                # Camera settings
                gr.Markdown("### 📷 Camera Settings")
                camera_preset = gr.Dropdown(
                    choices=list(CAMERA_PRESETS.keys()),
                    value="iPhone 12/13/14 (Main Camera)",
                    label="Camera Type",
                    info="Select your camera or choose 'Custom' for manual input"
                )
                with gr.Row():
                    fx_custom = gr.Number(value=1500, label="Focal Length X (pixels)", visible=False)
                    fy_custom = gr.Number(value=1500, label="Focal Length Y (pixels)", visible=False)

                process_btn = gr.Button("🔍 Process Image", variant="primary", size="lg")

                # Reference object selection
                gr.Markdown("### 📏 Reference Object")
                with gr.Row():
                    mask_number = gr.Number(value=1, label="Object Number", precision=0, minimum=1)
                    reference_size = gr.Number(value=10.0, label="Known Size (cm)", minimum=0.1)
                dimension_choice = gr.Radio(
                    choices=["width", "height"],
                    value="width",
                    label="Which dimension is the known size?"
                )
                select_ref_btn = gr.Button("📌 Set as Reference", variant="secondary")
                calculate_btn = gr.Button("📐 Calculate All Sizes", variant="primary", size="lg")

            with gr.Column(scale=2):
                # Output section
                gr.Markdown("### 🖼️ Results")
                image_output = gr.Image(label="Detected Objects")
                status_output = gr.Textbox(label="Status", lines=4, max_lines=10)
                results_output = gr.Textbox(label="Size Measurements", lines=15, max_lines=25)

        # Event handlers
        def toggle_custom_focal(preset):
            if preset == "Custom":
                return gr.update(visible=True), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False)

        camera_preset.change(
            toggle_custom_focal,
            inputs=[camera_preset],
            outputs=[fx_custom, fy_custom]
        )

        # Load models on startup
        demo.load(app.load_models, outputs=[status_output])

        process_btn.click(
            app.process_image,
            inputs=[image_input, camera_preset, fx_custom, fy_custom],
            outputs=[image_output, status_output]
        )
        select_ref_btn.click(
            app.select_reference_object,
            inputs=[mask_number, reference_size, dimension_choice],
            outputs=[status_output]
        )
        calculate_btn.click(
            app.calculate_all_sizes,
            inputs=[camera_preset, fx_custom, fy_custom],
            outputs=[results_output]
        )

        # Additional controls and info
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🎯 Quick Actions")
                clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
            with gr.Column():
                gr.Markdown("### 📊 Session Info")
                session_info = gr.Textbox(label="Current Session", value="No image processed", interactive=False)

        # Event handlers for additional features
        def clear_session():
            app.current_image = None
            app.current_depth = None
            app.current_masks = None
            app.reference_object = None
            return (
                None,  # image_output
                "🗑️ Session cleared. Upload a new image to start.",  # status_output
                "",  # results_output
                "No image processed"  # session_info
            )

        def update_session_info(camera_preset, fx_custom, fy_custom):
            if app.current_masks is None:
                return "No image processed"
            if camera_preset == "Custom":
                cam_info = f"Custom (fx={fx_custom:.0f}, fy={fy_custom:.0f})"
            else:
                cam_info = camera_preset
            ref_info = "None selected"
            if app.reference_object:
                ref_info = f"Object with {app.reference_object['known_size_cm']} cm {app.reference_object['dimension']}"
            return f"📷 Camera: {cam_info}\n📏 Reference: {ref_info}\n🎯 Objects: {len(app.current_masks)}"

        clear_btn.click(
            clear_session,
            outputs=[image_output, status_output, results_output, session_info]
        )

        # Update session info when settings change
        for component in [camera_preset, fx_custom, fy_custom]:
            component.change(
                update_session_info,
                inputs=[camera_preset, fx_custom, fy_custom],
                outputs=[session_info]
            )
| gr.Markdown(""" | |
| ### π‘ Tips for best results: | |
| - Use good lighting and avoid shadows | |
| - Ensure objects are clearly visible and separated | |
| - Choose a reference object you know the exact size of | |
| - For phones, try the camera-specific presets first | |
| - Custom focal lengths can be calibrated using camera calibration tools | |
| """) | |
| return demo | |
| if __name__ == "__main__": | |
| demo = create_interface() | |
| demo.launch(share=True, debug=True) |
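
# To run this demo locally (a sketch; the dependency list below is an
# assumption inferred from the imports above, not a pinned requirements file):
#   pip install gradio numpy opencv-python torch pillow matplotlib
#   python app.py
# Note: share=True creates a temporary public link for local runs; on Hugging
# Face Spaces the app is already served, so the flag is not needed there.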