# utils.py: ControlNet preprocessing helpers (Canny edges, MiDaS depth maps)
import cv2
import numpy as np
from PIL import Image
import torch
from transformers import DPTForDepthEstimation, DPTImageProcessor

# Initialize the depth estimator once at import time to avoid re-loading it
# on every call. Use config.DEVICE and config.DTYPE for consistency, with a
# standalone fallback when config.py is absent.
try:
    from config import DEVICE, DTYPE
except ImportError:
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

print(f"Loading Depth Estimator on {DEVICE} with {DTYPE}...")
depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas")
depth_estimator.to(DEVICE)
feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
print("Depth Estimator loaded.")


def apply_canny(image: Image.Image) -> Image.Image:
    """
    Applies Canny edge detection to a PIL Image and returns a 3-channel
    edge map.
    """
    # Convert to RGB first so RGBA or palette inputs don't break cv2.Canny
    image_np = np.array(image.convert("RGB"))
    image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    # Apply Canny with the common 100/200 hysteresis thresholds (tunable)
    image_edges = cv2.Canny(image_np, 100, 200)
    # Stack the single channel back into 3 channels for ControlNet
    image_edges = np.stack([image_edges] * 3, axis=2)
    return Image.fromarray(image_edges)
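
# Minimal usage sketch for apply_canny (self-contained: the gradient image is
# synthetic, so no input file is assumed):
#
#     gradient = np.tile(np.linspace(0, 255, 256, dtype=np.uint8), (256, 1))
#     edges = apply_canny(Image.fromarray(gradient))  # -> 3-channel edge map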


def apply_depth(image: Image.Image) -> Image.Image:
    """
    Estimates depth from a PIL Image with MiDaS (DPT-hybrid) and returns a
    grayscale depth map resized back to the input resolution.
    """
    original_size = image.size  # PIL size is (width, height)
    # Downscale very large inputs for speed, preserving the aspect ratio;
    # the depth map is interpolated back to the original size afterwards.
    max_dim = max(original_size)
    if max_dim > 768:
        scale_factor = 768 / max_dim
        image = image.resize(
            (int(original_size[0] * scale_factor), int(original_size[1] * scale_factor)),
            Image.BICUBIC,
        )

    inputs = feature_extractor(images=image, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = depth_estimator(**inputs)
        predicted_depth = outputs.predicted_depth

    # Interpolate to the original size; interpolate expects (height, width)
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size=original_size[::-1],
        mode="bicubic",
        align_corners=False,
    )
    output = prediction.squeeze().cpu().numpy()

    # Normalize to 0-255 and convert to uint8, guarding against a constant
    # depth map (np.interp needs an increasing input range)
    lo, hi = output.min(), output.max()
    if hi - lo < 1e-8:
        return Image.fromarray(np.zeros_like(output, dtype=np.uint8))
    formatted_output = np.interp(output, (lo, hi), (0, 255)).astype(np.uint8)
    return Image.fromarray(formatted_output)
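

if __name__ == "__main__":
    # Smoke test: a minimal sketch assuming no input assets. Builds a synthetic
    # RGB gradient, runs both preprocessors, and reports the output shapes.
    gradient = np.tile(np.linspace(0, 255, 512, dtype=np.uint8), (384, 1))
    test_image = Image.fromarray(gradient).convert("RGB")
    canny_map = apply_canny(test_image)
    depth_map = apply_depth(test_image)
    print(f"Canny map: size={canny_map.size}, mode={canny_map.mode}")
    print(f"Depth map: size={depth_map.size}, mode={depth_map.mode}")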