1.1.0
- app.py +129 -124
- depth_pro/utils.py +114 -0
- requirements.txt +4 -1
app.py
CHANGED
@@ -2,14 +2,20 @@ import os
 import tempfile
 import numpy as np
 import cv2
+from pathlib import Path
+import logging
+from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation
 import torch
 from PIL import Image
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException
 from fastapi.responses import JSONResponse, HTMLResponse
-from
-
+from typing import Any, Dict, List, Tuple, Union
+import pillow_heif
 import json
 
+from depth_pro.utils import load_rgb, extract_exif
+
+
 # Initialize FastAPI app
 app = FastAPI(
     title="Depth Pro Distance Estimation",
@@ -26,150 +32,120 @@ def initialize_depth_pipeline():
     """Initialize the Depth Pro pipeline"""
     try:
         print("Initializing Depth Pro pipeline...")
-
-
-
-
-            torch_dtype=torch.float32  # Use float32 for CPU compatibility
-        )
-        print("Depth Pro pipeline initialized successfully!")
-        return pipe
+        image_processor = DepthProImageProcessorFast.from_pretrained("apple/DepthPro-hf")
+        model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf").to(device)
+
+        return model, image_processor
     except Exception as e:
         print(f"Error initializing pipeline: {e}")
         print("Falling back to dummy pipeline...")
         return None
 
-class DummyDepthPipeline:
-    """Dummy pipeline for when the real model fails to load"""
-
-    def __call__(self, image):
-        """Generate dummy depth prediction"""
-        if isinstance(image, str):
-            image = Image.open(image)
-        elif isinstance(image, np.ndarray):
-            image = Image.fromarray(image)
-
-        width, height = image.size
-
-        # Generate a realistic-looking depth map
-        depth = self._generate_dummy_depth(height, width)
-
-        return {"depth": depth}
-
-    def _generate_dummy_depth(self, height, width):
-        """Generate a dummy depth map that looks realistic"""
-        # Create depth that decreases from bottom to top (simulating perspective)
-        y_coords = np.linspace(10.0, 2.0, height)  # 10m to 2m depth
-        depth = np.tile(y_coords[:, np.newaxis], (1, width))
-
-        # Add some noise and variation
-        noise = np.random.normal(0, 0.5, (height, width))
-        depth += noise
-
-        # Ensure positive depths
-        depth = np.maximum(depth, 0.1)
-
-        return depth
 
 class DepthEstimator:
-    def __init__(self,
+    def __init__(self, model=None, image_processor=None):
         self.device = torch.device('cpu')  # Force CPU
         print("Initializing Depth Pro estimator...")
-        self.
+        self.model = model
+        self.image_processor = image_processor
         print("Depth Pro estimator initialized successfully!")
 
     def estimate_depth(self, image_path):
         try:
             # Load image
-            image = Image.open(image_path)
+            image = Image.open(image_path)
 
             # Resize image for processing
             resized_image, new_size = self.resize_image(image)
-
-
-
-
-            #
-
-
-
-
-
-
-
+
+            rgb_image = load_rgb(resized_image.name)
+            f_px = rgb_image[-1]
+            eval_image = rgb_image[0]
+            # Perform inference using model
+            inputs = self.image_processor(eval_image, return_tensors="pt").to(self.device)
+            with torch.no_grad():
+                outputs = self.model(**inputs)
+            post_processed_output = self.image_processor.post_process_depth_estimation(
+                outputs, target_sizes=[(new_size[1], new_size[0])],
+            )
+            result = post_processed_output[0]
+            field_of_view = result["field_of_view"]
+            focal_length = result["focal_length"]
+            depth = result["predicted_depth"]
+
             # Convert to numpy if needed
             if isinstance(depth, torch.Tensor):
-                depth = depth.cpu().numpy()
+                depth = depth.detach().cpu().numpy()
            elif not isinstance(depth, np.ndarray):
                 depth = np.array(depth)
 
             # Estimate focal length (rough estimation)
-
+            print(f_px, focal_length)
+
 
-            return depth, new_size,
+            return depth, new_size, focal_length
 
         except Exception as e:
             print(f"Error in depth estimation: {e}")
             return None, None, None
 
-    def resize_image(self,
-
-
-
-
-
-
-
-        return resized_image, new_size
-
-def find_topmost_pixel(image):
-    """Find the topmost non-zero pixel in the image (simulating footpath detection)"""
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    # Simple edge detection to find potential footpath boundaries
-    edges = cv2.Canny(gray, 50, 150)
+    def resize_image(self, image_path, max_size=1536):
+        with Image.open(image_path) as img:
+            ratio = max_size / max(img.size)
+            new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
+            img = img.resize(new_size, Image.Resampling.LANCZOS)
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
+                img.save(temp_file, format="PNG")
+                return temp_file, new_size
 
-
-
-
+
+def find_topmost_pixel(mask):
+    '''Top Pixel from footpath mask'''
+    footpath_pixels = np.where(mask > 0)
+    if len(footpath_pixels[0]) == 0:
         return None
-
-
-
-    top_x_coords = edge_pixels[1][top_pixels_mask]
+    min_y = np.min(footpath_pixels[0])
+    top_pixels_mask = footpath_pixels[0] == min_y
+    top_x_coords = footpath_pixels[1][top_pixels_mask]
     center_idx = len(top_x_coords) // 2
     return (min_y, top_x_coords[center_idx])
 
-def
-    """Find the bottommost pixel
+def find_bottommost_footpath_pixel(mask, topmost_pixel):
+    """Find the bottommost pixel perpendicular to the topmost pixel within the mask"""
     if topmost_pixel is None:
         return None
 
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    edges = cv2.Canny(gray, 50, 150)
-
     top_y, top_x = topmost_pixel
 
-    # Find pixels in the same column
-    column_pixels = np.where((
+    # Find all mask pixels in the same x-column as the topmost pixel
+    column_pixels = np.where((mask > 0) & (np.arange(mask.shape[1])[None, :] == top_x))
 
     if len(column_pixels[0]) == 0:
-        #
-
-        if len(
+        # If no pixels in the same column, find the bottommost pixel in the entire mask
+        footpath_pixels = np.where(mask > 0)
+        if len(footpath_pixels[0]) == 0:
             return None
-    max_y = np.max(
-    bottom_pixels_mask =
-    bottom_x_coords =
+        max_y = np.max(footpath_pixels[0])
+        bottom_pixels_mask = footpath_pixels[0] == max_y
+        bottom_x_coords = footpath_pixels[1][bottom_pixels_mask]
         center_idx = len(bottom_x_coords) // 2
         return (max_y, bottom_x_coords[center_idx])
 
+    # Find the bottommost pixel in the same x-column
     max_y_in_column = np.max(column_pixels[0])
     return (max_y_in_column, top_x)
 
-def estimate_real_world_distance(depth_map, topmost_pixel, bottommost_pixel):
+
+def estimate_real_world_distance(depth_map, topmost_pixel, mask):
     """Estimate real-world distance between two pixels using depth information"""
-
+
+    if topmost_pixel is None or depth_map is None:
+        return None
+
+    # Find the bottommost pixel perpendicular to the topmost pixel
+    bottommost_pixel = find_bottommost_footpath_pixel(mask, topmost_pixel)
+
+    if bottommost_pixel is None:
         return None
 
     top_y, top_x = topmost_pixel
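
Note: the hunk above swaps the old transformers pipeline (and the DummyDepthPipeline fallback) for an explicit processor/model pair. As rendered, initialize_depth_pipeline calls .to(device) on a `device` name that does not appear to be defined in the visible diff, so the except branch may fire more often than intended. For reference, a minimal standalone sketch of the same Hugging Face DepthPro flow, with the device defined explicitly (model id as in the diff; the input file name is illustrative):

    import torch
    from PIL import Image
    from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation

    device = torch.device("cpu")
    image_processor = DepthProImageProcessorFast.from_pretrained("apple/DepthPro-hf")
    model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf").to(device)

    image = Image.open("example.jpg").convert("RGB")  # illustrative input
    inputs = image_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-processing resizes the prediction back to (height, width); the
    # output also carries the model's field-of-view and focal-length estimates.
    post = image_processor.post_process_depth_estimation(
        outputs, target_sizes=[(image.height, image.width)]
    )[0]
    depth = post["predicted_depth"]       # metric depth map (torch.Tensor)
    focal_length = post["focal_length"]   # estimated focal length in pixels
    fov = post["field_of_view"]           # estimated field of view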
@@ -188,7 +164,7 @@ def estimate_real_world_distance(depth_map, topmost_pixel, bottommost_pixel):
         print("Invalid depth values (NaN) found")
         return None
 
-    distance_meters = float(
+    distance_meters = float(topmost_depth - bottommost_depth)
 
     print(f"Distance calculation:")
     print(f"  Topmost pixel: ({top_y}, {top_x}) = {topmost_depth:.3f}m")
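
Note on semantics: distance_meters above is the difference between the Depth Pro depth values sampled at the topmost and bottommost mask pixels, i.e. a separation along the camera's viewing direction rather than a full Euclidean distance between the two 3D points. A toy walk-through of how the new helpers feed this calculation (synthetic mask and depth map, values purely illustrative; it assumes depths are read directly at the two pixels):

    import numpy as np

    # 5x5 footpath mask: a vertical strip in column 2.
    mask = np.zeros((5, 5), dtype=np.uint8)
    mask[1:4, 2] = 255

    # Synthetic metric depth map: 8 m at the top row down to 2 m at the bottom.
    depth_map = np.repeat(np.linspace(8.0, 2.0, 5)[:, None], 5, axis=1)

    top = find_topmost_pixel(mask)                      # -> (1, 2)
    bottom = find_bottommost_footpath_pixel(mask, top)  # -> (3, 2)

    # 6.5 m - 3.5 m = 3.0 m of depth separation between the two pixels.
    print(depth_map[top] - depth_map[bottom])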
@@ -197,10 +173,14 @@ def estimate_real_world_distance(depth_map, topmost_pixel, bottommost_pixel):
 
     return distance_meters
 
+
+
+
+
 # Initialize depth estimator globally
 print("Initializing Depth Pro pipeline...")
-
-depth_estimator = DepthEstimator(
+depth_model, image_processor = initialize_depth_pipeline()
+depth_estimator = DepthEstimator(depth_model, image_processor)
 
 @app.get("/health")
 async def health_check():
@@ -218,7 +198,7 @@ async def api_info():
     }
 
 @app.post("/estimate-depth")
-async def estimate_depth_endpoint(file: UploadFile = File(...)):
+async def estimate_depth_endpoint(file: UploadFile = File(...), mask: UploadFile = File(...)):
     """FastAPI endpoint for depth estimation and distance calculation"""
     try:
         # Save uploaded file temporarily
@@ -226,13 +206,20 @@ async def estimate_depth_endpoint(file: UploadFile = File(...)):
             content = await file.read()
             temp_file.write(content)
             temp_file_path = temp_file.name
-
+
+        # Save uploaded mask temporarily
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as mtemp_file:
+            content = await mask.read()
+            mtemp_file.write(content)
+            temp_file_path_mask = mtemp_file.name
+
         # Load image for pixel detection
         image = cv2.imread(temp_file_path)
-        if image is None:
+        mask = cv2.imread(temp_file_path_mask)
+        if image is None or mask is None:
             return JSONResponse(
                 status_code=400,
-                content={"error": "Could not load image"}
+                content={"error": "Could not load image or mask"}
             )
 
         # Estimate depth
@@ -249,19 +236,18 @@ async def estimate_depth_endpoint(file: UploadFile = File(...)):
 
         # Find key pixels
         topmost_pixel = find_topmost_pixel(resized_image)
-        bottommost_pixel = find_bottommost_pixel(resized_image, topmost_pixel)
 
         # Calculate distance
-        distance_meters = estimate_real_world_distance(depth_map, topmost_pixel, bottommost_pixel)
+        distance_meters = estimate_real_world_distance(depth_map, topmost_pixel, mask)
 
         # Clean up
         os.unlink(temp_file_path)
+        os.unlink(temp_file_path_mask)
 
         result = {
             "depth_map_shape": depth_map.shape,
             "focal_length_px": float(focal_length_px) if focal_length_px is not None else None,
-            "topmost_pixel": [int(topmost_pixel[0]), int(topmost_pixel[1])] if topmost_pixel else None,
-            "bottommost_pixel": [int(bottommost_pixel[0]), int(bottommost_pixel[1])] if bottommost_pixel else None,
+            "topmost_pixel": [int(topmost_pixel[0]), int(topmost_pixel[1])] if topmost_pixel else None,
             "distance_meters": distance_meters,
             "depth_stats": {
                 "min_depth": float(np.min(depth_map)),
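
The endpoint now requires two multipart fields, file and mask. One detail worth confirming in the hunk above: the unchanged context line still passes resized_image to find_topmost_pixel, whereas the rewritten helper documents itself as operating on the footpath mask. A minimal client sketch against the updated endpoint (host and port are assumptions; the truncated __main__ block presumably starts the server):

    import requests

    url = "http://localhost:8000/estimate-depth"  # assumed host/port
    with open("footpath.jpg", "rb") as f, open("footpath_mask.png", "rb") as m:
        response = requests.post(url, files={"file": f, "mask": m})
    response.raise_for_status()

    result = response.json()
    print(result["distance_meters"])   # depth separation in meters, or None
    print(result["focal_length_px"])   # focal length estimate in pixels
    print(result["depth_stats"])       # min/max/mean depth of the map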
@@ -325,11 +311,20 @@ async def root():
             background-color: #ecf0f1;
         }
         input[type="file"] {
-            margin:
+            margin: 10px 0;
             padding: 10px;
             border: 1px solid #bdc3c7;
             border-radius: 5px;
         }
+        .file-group {
+            margin: 20px 0;
+        }
+        .file-label {
+            display: block;
+            margin-bottom: 8px;
+            font-weight: bold;
+            color: #2c3e50;
+        }
         button {
             background-color: #3498db;
             color: white;
@@ -373,14 +368,20 @@ async def root():
    <body>
        <div class="container">
            <h1>🔍 Depth Pro Distance Estimation</h1>
-            <p class="subtitle">Upload an image to estimate depth and calculate distances using Apple's Depth Pro model</p>
+            <p class="subtitle">Upload an image and a footpath mask to estimate depth and calculate distances using Apple's Depth Pro model</p>
 
            <div class="upload-section">
-                <h3>Upload Image</h3>
+                <h3>Upload Image and Mask</h3>
                <form id="uploadForm" enctype="multipart/form-data">
-                    <
-
-
+                    <div style="margin: 20px 0;">
+                        <label for="imageFile" style="display: block; margin-bottom: 5px; font-weight: bold;">📸 Main Image:</label>
+                        <input type="file" id="imageFile" name="file" accept="image/*" required style="width: 100%;">
+                    </div>
+                    <div style="margin: 20px 0;">
+                        <label for="maskFile" style="display: block; margin-bottom: 5px; font-weight: bold;">🎭 Footpath Mask:</label>
+                        <input type="file" id="maskFile" name="mask" accept="image/*" required style="width: 100%;">
+                    </div>
+                    <button type="submit">Analyze Image with Mask</button>
                </form>
 
                <div id="results" class="results">
@@ -391,7 +392,7 @@ async def root():
 
            <div class="endpoint-info">
                <h3>🔗 API Endpoints</h3>
-                <p><strong>POST /estimate-depth</strong> - Upload image for depth estimation</p>
+                <p><strong>POST /estimate-depth</strong> - Upload image and footpath mask for depth estimation</p>
                <p><strong>GET /docs</strong> - API documentation</p>
                <p><strong>GET /health</strong> - Health check</p>
            </div>
@@ -400,7 +401,8 @@ async def root():
                <h3>✨ Features</h3>
                <ul>
                    <li>🎯 Monocular depth estimation using Depth Pro</li>
-                    <li
+                    <li>🎭 Footpath mask-based analysis</li>
+                    <li>📏 Real-world distance calculation between mask boundaries</li>
                    <li>🖥️ CPU-optimized processing</li>
                    <li>🚀 Fast inference suitable for real-time use</li>
                </ul>
@@ -412,19 +414,26 @@ async def root():
            e.preventDefault();
 
            const fileInput = document.getElementById('imageFile');
+            const maskInput = document.getElementById('maskFile');
            const resultsDiv = document.getElementById('results');
            const resultsContent = document.getElementById('resultsContent');
 
            if (!fileInput.files[0]) {
-                alert('Please select
+                alert('Please select a main image file');
+                return;
+            }
+
+            if (!maskInput.files[0]) {
+                alert('Please select a footpath mask file');
                return;
            }
 
            const formData = new FormData();
            formData.append('file', fileInput.files[0]);
+            formData.append('mask', maskInput.files[0]);
 
            try {
-                resultsContent.innerHTML = '<p>🔄 Processing image...</p>';
+                resultsContent.innerHTML = '<p>🔄 Processing image and mask...</p>';
                resultsDiv.style.display = 'block';
 
                const response = await fetch('/estimate-depth', {
@@ -439,11 +448,10 @@ async def root():
                html += `<p><strong>📐 Distance:</strong> ${result.distance_meters ? result.distance_meters.toFixed(3) + ' meters' : 'N/A'}</p>`;
                html += `<p><strong>🎯 Focal Length:</strong> ${result.focal_length_px ? result.focal_length_px.toFixed(2) + ' pixels' : 'N/A'}</p>`;
                html += `<p><strong>📊 Depth Map Shape:</strong> ${result.depth_map_shape ? result.depth_map_shape.join(' x ') : 'N/A'}</p>`;
-                html += `<p><strong>🔝 Top Pixel:</strong> ${result.topmost_pixel ? `(${result.topmost_pixel[0]}, ${result.topmost_pixel[1]})` : 'N/A'}</p>`;
-                html += `<p><strong>🔽 Bottom Pixel:</strong> ${result.bottommost_pixel ? `(${result.bottommost_pixel[0]}, ${result.bottommost_pixel[1]})` : 'N/A'}</p>`;
+                html += `<p><strong>🔝 Top Mask Pixel:</strong> ${result.topmost_pixel ? `(${result.topmost_pixel[0]}, ${result.topmost_pixel[1]})` : 'N/A'}</p>`;
 
                if (result.depth_stats) {
-                    html += '<h4
+                    html += '<h4>📈 Depth Statistics:</h4>';
                    html += `<p><strong>Min Depth:</strong> ${result.depth_stats.min_depth.toFixed(3)}m</p>`;
                    html += `<p><strong>Max Depth:</strong> ${result.depth_stats.max_depth.toFixed(3)}m</p>`;
                    html += `<p><strong>Mean Depth:</strong> ${result.depth_stats.mean_depth.toFixed(3)}m</p>`;
@@ -464,9 +472,6 @@ async def root():
    """
    return HTMLResponse(content=html_content)
 
-def gradio_interface(image):
-    """Removed Gradio interface - keeping for backward compatibility"""
-    return "Gradio interface has been removed. Please use the web interface or API.", None
 
# FastAPI app is ready to run
if __name__ == "__main__":
depth_pro/utils.py
ADDED
@@ -0,0 +1,114 @@
+# ALL UTIL CREDITS TO DEPTH PRO TEAM
+
+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Tuple, Union
+
+import numpy as np
+import pillow_heif
+from PIL import ExifTags, Image, TiffTags
+from pillow_heif import register_heif_opener
+
+register_heif_opener()
+LOGGER = logging.getLogger(__name__)
+
+
+def extract_exif(img_pil: Image) -> Dict[str, Any]:
+    """Return exif information as a dictionary.
+
+    Args:
+    ----
+        img_pil: A Pillow image.
+
+    Returns:
+    -------
+        A dictionary with extracted EXIF information.
+
+    """
+    # Get full exif description from get_ifd(0x8769):
+    # cf https://pillow.readthedocs.io/en/stable/releasenotes/8.2.0.html#image-getexif-exif-and-gps-ifd
+    img_exif = img_pil.getexif().get_ifd(0x8769)
+    exif_dict = {ExifTags.TAGS[k]: v for k, v in img_exif.items() if k in ExifTags.TAGS}
+
+    tiff_tags = img_pil.getexif()
+    tiff_dict = {
+        TiffTags.TAGS_V2[k].name: v
+        for k, v in tiff_tags.items()
+        if k in TiffTags.TAGS_V2
+    }
+    return {**exif_dict, **tiff_dict}
+
+
+def fpx_from_f35(width: float, height: float, f_mm: float = 50) -> float:
+    """Convert a focal length given in mm (35mm film equivalent) to pixels."""
+    return f_mm * np.sqrt(width**2.0 + height**2.0) / np.sqrt(36**2 + 24**2)
+
+
+def load_rgb(
+    path: Union[Path, str], auto_rotate: bool = True, remove_alpha: bool = True
+) -> Tuple[np.ndarray, List[bytes], float]:
+    """Load an RGB image.
+
+    Args:
+    ----
+        path: The url to the image to load.
+        auto_rotate: Rotate the image based on the EXIF data, default is True.
+        remove_alpha: Remove the alpha channel, default is True.
+
+    Returns:
+    -------
+        img: The image loaded as a numpy array.
+        icc_profile: The color profile of the image.
+        f_px: The optional focal length in pixels, extracting from the exif data.
+
+    """
+    LOGGER.debug(f"Loading image {path} ...")
+
+    path = Path(path)
+    if path.suffix.lower() in [".heic"]:
+        heif_file = pillow_heif.open_heif(path, convert_hdr_to_8bit=True)
+        img_pil = heif_file.to_pillow()
+    else:
+        img_pil = Image.open(path)
+
+    img_exif = extract_exif(img_pil)
+    icc_profile = img_pil.info.get("icc_profile", None)
+
+    # Rotate the image.
+    if auto_rotate:
+        exif_orientation = img_exif.get("Orientation", 1)
+        if exif_orientation == 3:
+            img_pil = img_pil.transpose(Image.ROTATE_180)
+        elif exif_orientation == 6:
+            img_pil = img_pil.transpose(Image.ROTATE_270)
+        elif exif_orientation == 8:
+            img_pil = img_pil.transpose(Image.ROTATE_90)
+        elif exif_orientation != 1:
+            LOGGER.warning(f"Ignoring image orientation {exif_orientation}.")
+
+    img = np.array(img_pil)
+    # Convert to RGB if single channel.
+    if img.ndim < 3 or img.shape[2] == 1:
+        img = np.dstack((img, img, img))
+
+    if remove_alpha:
+        img = img[:, :, :3]
+
+    LOGGER.debug(f"\tHxW: {img.shape[0]}x{img.shape[1]}")
+
+    # Extract the focal length from exif data.
+    f_35mm = img_exif.get(
+        "FocalLengthIn35mmFilm",
+        img_exif.get(
+            "FocalLenIn35mmFilm", img_exif.get("FocalLengthIn35mmFormat", None)
+        ),
+    )
+    if f_35mm is not None and f_35mm > 0:
+        LOGGER.debug(f"\tfocal length @ 35mm film: {f_35mm}mm")
+        f_px = fpx_from_f35(img.shape[1], img.shape[0], f_35mm)
+    else:
+        f_px = None
+
+    return img, icc_profile, f_px
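
For reference, app.py consumes this module through load_rgb, which returns an (image, icc_profile, f_px) tuple; f_px is only populated when the EXIF data carries a 35mm-equivalent focal length. A short usage sketch (file names are illustrative):

    from depth_pro.utils import load_rgb, fpx_from_f35

    img, icc_profile, f_px = load_rgb("photo.heic")  # HEIC supported via pillow_heif
    print(img.shape)   # (H, W, 3) numpy array, alpha channel stripped
    print(f_px)        # focal length in pixels, or None without EXIF data

    # The underlying conversion: a 28mm-equivalent lens on a 4032x3024 photo
    # maps to roughly 3262 pixels of focal length.
    print(fpx_from_f35(4032, 3024, 28.0))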
requirements.txt
CHANGED
@@ -7,4 +7,7 @@ numpy
 huggingface-hub
 requests
 python-multipart
-accelerate
+accelerate
+torch
+torchvision
+pillow_heif