Spaces:

Saiky2k
/

DepthPro_CVProject

Sleeping

App Files Community

Saiky2k committed on Mar 21

Commit

2cdc70d

verified ·

1 Parent(s): 7e48aad

Update app.py

Browse files

Files changed (1) hide show

app.py +389 -328

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ from PIL import Image
 import cv2
 import numpy as np
 import torch
 import tempfile
 import os
 import requests
@@ -12,397 +14,456 @@ from io import BytesIO
 # Cấu hình trang
 st.set_page_config(page_title="Phát hiện người và độ sâu", layout="wide")
-# Tạo module độ sâu đơn giản
-class DepthEstimator:
-    def __init__(self):
-        self.model = None
-        self.processor = None
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    def load_model(self):
-        if self.model is None:
-            from transformers import AutoImageProcessor, AutoModelForDepthEstimation
-            self.processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-nyu")
-            self.model = AutoModelForDepthEstimation.from_pretrained("vinvino02/glpn-nyu")
-            self.model.to(self.device)
-            self.model.eval()
-        return self.model, self.processor
-    def predict_depth(self, image):
-        model, processor = self.load_model()
-        # Chuẩn bị đầu vào
-        if isinstance(image, np.ndarray):
-            # Chuyển từ OpenCV (BGR) sang RGB
-            if image.shape[2] == 3:
-                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-        else:
-            pil_image = image
-        inputs = processor(images=pil_image, return_tensors="pt").to(self.device)
-        # Dự đoán độ sâu
-        with torch.no_grad():
-            outputs = model(**inputs)
-            predicted_depth = outputs.predicted_depth
-        # Chuẩn hóa độ sâu để hiển thị tốt hơn
-        depth_min = torch.min(predicted_depth)
-        depth_max = torch.max(predicted_depth)
-        normalized_depth = (predicted_depth - depth_min) / (depth_max - depth_min)
-        normalized_depth = normalized_depth * 10  # Nhân với 10 để có giá trị mét hợp lý hơn
-        # Chuyển đổi sang mảng numpy
-        depth_map = normalized_depth.squeeze().cpu().numpy()
-        return depth_map
-# Tải và cache mô hình YOLO
 @st.cache_resource
 def load_yolo_model():
-    from ultralytics import YOLO
     model = YOLO("yolov8n.pt")
     return model
-# Phát hiện người trong ảnh
-def detect_people(image, confidence_threshold=0.5):
-    yolo_model = load_yolo_model()
-    results = yolo_model(image, conf=confidence_threshold)
-    person_boxes = []
-    for result in results:
-        boxes = result.boxes.xyxy.cpu().numpy()
-        classes = result.boxes.cls.cpu().numpy()
-        confs = result.boxes.conf.cpu().numpy()
-        for box, cls, conf in zip(boxes, classes, confs):
-            if result.names[int(cls)] == "person" and conf > confidence_threshold:
-                x1, y1, x2, y2 = map(int, box[:4])
-                person_boxes.append((x1, y1, x2, y2, conf))
-    return person_boxes
-# Xử lý ảnh
-def process_image(image, confidence=0.5):
-    # Tạo bản sao của ảnh để vẽ lên
-    display_image = image.copy()
-    # Phát hiện người
-    person_boxes = detect_people(image, confidence)
-    # Ước tính độ sâu
-    depth_estimator = DepthEstimator()
-    depth_map = depth_estimator.predict_depth(image)
-    # Tạo bản đồ màu độ sâu
-    depth_colormap = create_depth_colormap(depth_map)
-    # Vẽ khung giới hạn và thông tin độ sâu
-    for x1, y1, x2, y2, conf in person_boxes:
-        # Vẽ khung giới hạn
-        cv2.rectangle(display_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
-        # Tính toán độ sâu tại vị trí trung tâm
-        center_x = (x1 + x2) // 2
-        center_y = (y1 + y2) // 2
-        # Đảm bảo tọa độ nằm trong giới hạn
-        center_x = min(center_x, depth_map.shape[1] - 1) if center_x < depth_map.shape[1] else depth_map.shape[1] // 2
-        center_y = min(center_y, depth_map.shape[0] - 1) if center_y < depth_map.shape[0] else depth_map.shape[0] // 2
-        depth_value = depth_map[center_y, center_x]
-        # Vẽ nhãn độ sâu
-        text = f"Độ sâu: {depth_value:.2f}m ({conf:.2f})"
-        draw_label(display_image, text, (x1, y1))
-    return display_image, depth_colormap, len(person_boxes)
-# Xử lý video
-def process_video(video_path, confidence=0.5, progress_bar=None, progress_text=None):
-    # Mở video
-    cap = cv2.VideoCapture(video_path)
-    # Lấy thuộc tính video
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    # Tạo tệp đầu ra
-    temp_output_dir = tempfile.mkdtemp()
-    output_video_path = os.path.join(temp_output_dir, "detection_depth.mp4")
-    # Thiết lập writer
-    fourcc = cv2.VideoWriter_fourcc(*'XVID')
-    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width * 2, height))
-    # Đối tượng phát hiện và ước tính độ sâu
-    depth_estimator = DepthEstimator()
-    # Biến đếm
-    frame_counter = 0
-    person_count = 0
-    # Tạo cột để hiển thị khung hình
-    preview_col1, preview_col2 = st.columns(2)
-    detection_placeholder = preview_col1.empty()
-    depth_placeholder = preview_col2.empty()
-    try:
-        while cap.isOpened():
-            ret, frame = cap.read()
-            if not ret:
-                break
-            frame_counter += 1
-            # Cập nhật tiến trình
-            if progress_bar:
-                progress = int(frame_counter / total_frames * 100)
-                progress_bar.progress(progress)
-            if frame_counter % 10 == 0 and progress_text:
-                progress_text.text(f"Đang xử lý: {frame_counter}/{total_frames} khung hình")
             # Phát hiện người
-            person_boxes = detect_people(frame, confidence)
-            person_count += len(person_boxes)
-            # Ước tính độ sâu (chỉ xử lý mỗi 5 khung hình để tăng tốc độ)
-            if frame_counter % 5 == 0 or frame_counter == 1:
-                depth_map = depth_estimator.predict_depth(frame)
             # Tạo bản đồ màu độ sâu
-            depth_colormap = create_depth_colormap(depth_map)
-            # Vẽ khung giới hạn và thông tin độ sâu
-            for x1, y1, x2, y2, conf in person_boxes:
-                # Vẽ khung giới hạn
-                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
-                # Tính toán độ sâu tại vị trí trung tâm
-                center_x = (x1 + x2) // 2
-                center_y = (y1 + y2) // 2
-                # Đảm bảo tọa độ nằm trong giới hạn
-                center_x = min(center_x, depth_map.shape[1] - 1) if center_x < depth_map.shape[1] else depth_map.shape[1] // 2
-                center_y = min(center_y, depth_map.shape[0] - 1) if center_y < depth_map.shape[0] else depth_map.shape[0] // 2
-                depth_value = depth_map[center_y, center_x]
-                # Vẽ nhãn độ sâu
-                text = f"Độ sâu: {depth_value:.2f}m ({conf:.2f})"
-                draw_label(frame, text, (x1, y1))
-            # Ghép hai khung hình lại với nhau
-            combined_frame = np.hstack((frame, cv2.cvtColor(depth_colormap, cv2.COLOR_RGB2BGR)))
-            # Ghi khung hình
-            out.write(combined_frame)
-            # Hiển thị khung hình trong Streamlit (cập nhật mỗi 5 khung hình)
-            if frame_counter % 5 == 0:
-                detection_placeholder.image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), caption="Phát hiện người", use_column_width=True)
-                depth_placeholder.image(depth_colormap, caption="Bản đồ độ sâu", use_column_width=True)
-    finally:
-        # Giải phóng tài nguyên
-        cap.release()
-        out.release()
-        # Tính trung bình số người phát hiện được
-        avg_persons = person_count / frame_counter if frame_counter > 0 else 0
-        return output_video_path, avg_persons
-# Hàm tiện ích
-def create_depth_colormap(depth_map):
-    # Chuẩn hóa độ sâu từ 0-1
-    normalized = (depth_map - np.min(depth_map)) / (np.max(depth_map) - np.min(depth_map))
-    # Đảo ngược (gần = màu ấm, xa = màu lạnh)
-    inv_depth = 1 - normalized
-    # Chuyển đổi sang bản đồ màu
-    colored = cv2.applyColorMap((inv_depth * 255).astype(np.uint8), cv2.COLORMAP_TURBO)
-    # Chuyển đổi từ BGR sang RGB
-    return cv2.cvtColor(colored, cv2.COLOR_BGR2RGB)
-def draw_label(image, text, position):
-    font = cv2.FONT_HERSHEY_SIMPLEX
-    font_scale = 0.7
-    font_thickness = 2
-    text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]
-    x, y = position
-    text_x = x
-    text_y = y - 10
-    rect_x1 = text_x - 5
-    rect_y1 = text_y - text_size[1] - 5
-    rect_x2 = text_x + text_size[0] + 5
-    rect_y2 = text_y + 5
-    cv2.rectangle(image, (rect_x1, rect_y1), (rect_x2, rect_y2), (0, 255, 0), -1)
-    cv2.putText(image, text, (text_x, text_y), font, font_scale, (0, 0, 0), font_thickness)
-# Giao diện người dùng chính
-def main():
-    st.title("Phát hiện người và Ước tính độ sâu")
-    # Sidebar với tùy chọn
-    st.sidebar.header("Tùy chọn")
-    confidence = st.sidebar.slider("Ngưỡng tin cậy", 0.0, 1.0, 0.5)
-    # Chọn chế độ: Ảnh hoặc Video
-    mode = st.sidebar.radio("Chế độ", ["Ảnh", "Video"])
-    # Chọn nguồn: Tải lên hoặc Mẫu
-    source = st.sidebar.radio("Nguồn", ["Tải lên", "Mẫu"])
-    if mode == "Ảnh":
-        if source == "Tải lên":
-            uploaded_file = st.file_uploader("Tải lên ảnh", type=['jpg', 'jpeg', 'png'])
-            if uploaded_file is not None:
-                image = Image.open(uploaded_file)
-                image = np.array(image)
-                # Chuyển đổi sang RGB nếu là RGBA
-                if image.shape[2] == 4:
-                    image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
-                st.image(image, caption="Ảnh đã tải lên", use_column_width=True)
-                if st.button("Xử lý Ảnh"):
-                    with st.spinner("Đang xử lý ảnh..."):
-                        result_image, depth_colormap, person_count = process_image(image, confidence)
-                    st.success(f"Phát hiện {person_count} người trong ảnh")
-                    col1, col2 = st.columns(2)
-                    col1.image(result_image, caption="Kết quả phát hiện", use_column_width=True)
-                    col2.image(depth_colormap, caption="Bản đồ độ sâu", use_column_width=True)
-        else:
-            # Sử dụng ảnh mẫu
-            st.info("Đang sử dụng ảnh mẫu...")
-            sample_img_url = "https://storage.googleapis.com/sfr-vision-language-research/DINO/ground_truth_images/000000014439.jpg"
-            try:
-                response = requests.get(sample_img_url)
-                image = Image.open(BytesIO(response.content))
-                image = np.array(image)
-                st.image(image, caption="Ảnh mẫu", use_column_width=True)
-                if st.button("Xử lý Ảnh"):
-                    with st.spinner("Đang xử lý ảnh..."):
-                        result_image, depth_colormap, person_count = process_image(image, confidence)
-                    st.success(f"Phát hiện {person_count} người trong ảnh")
-                    col1, col2 = st.columns(2)
-                    col1.image(result_image, caption="Kết quả phát hiện", use_column_width=True)
-                    col2.image(depth_colormap, caption="Bản đồ độ sâu", use_column_width=True)
-            except Exception as e:
-                st.error(f"Không thể tải ảnh mẫu: {e}")
-    else:
-        # Chế độ Video
-        if source == "Tải lên":
-            uploaded_file = st.file_uploader("Tải lên video", type=['mp4', 'avi', 'mov'])
-            if uploaded_file is not None:
-                # Lưu tệp đã tải lên vào thư mục tạm thời
-                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
-                temp_file.write(uploaded_file.read())
-                video_path = temp_file.name
-                temp_file.close()
-                st.video(video_path)
-                if st.button("Xử lý Video"):
-                    progress_bar = st.progress(0)
-                    progress_text = st.empty()
-                    with st.spinner("Đang xử lý video..."):
-                        output_path, avg_persons = process_video(video_path, confidence, progress_bar, progress_text)
-                    st.success(f"Xử lý video hoàn tất! Trung bình phát hiện {avg_persons:.1f} người/khung hình")
-                    st.video(output_path)
-                    # Nút tải xuống
-                    with open(output_path, 'rb') as file:
-                        st.download_button(
-                            label="Tải xuống video kết quả",
-                            data=file,
-                            file_name="detection_depth_result.mp4",
-                            mime="video/mp4"
-                        )
-                    # Xóa tệp tạm thời
-                    os.unlink(video_path)
-        else:
-            # Sử dụng video mẫu
-            st.info("Đang sử dụng video mẫu...")
-            sample_video_url = "https://huggingface.co/spaces/Nupoor/SampleVideoDataset/resolve/main/pexels-richard-de-souza-1635985.mp4"
-            try:
-                response = requests.get(sample_video_url)
-                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
-                temp_file.write(response.content)
-                video_path = temp_file.name
-                temp_file.close()
-                st.video(video_path)
-                if st.button("Xử lý Video"):
-                    progress_bar = st.progress(0)
-                    progress_text = st.empty()
-                    with st.spinner("Đang xử lý video..."):
-                        output_path, avg_persons = process_video(video_path, confidence, progress_bar, progress_text)
-                    st.success(f"Xử lý video hoàn tất! Trung bình phát hiện {avg_persons:.1f} người/khung hình")
-                    st.video(output_path)
-                    # Nút tải xuống
-                    with open(output_path, 'rb') as file:
-                        st.download_button(
-                            label="Tải xuống video kết quả",
-                            data=file,
-                            file_name="detection_depth_result.mp4",
-                            mime="video/mp4"
-                        )
-                    # Xóa tệp tạm thời
-                    os.unlink(video_path)
-            except Exception as e:
-                st.error(f"Không thể tải video mẫu: {e}")
-    # Thông tin
     st.sidebar.header("Thông tin")
-    st.sidebar.markdown("""
     **Mô hình sử dụng:**
     - Phát hiện người: YOLOv8n
-    - Ước tính độ sâu: GLPN-NYU
-    **Cách sử dụng:**
-    1. Chọn chế độ (Ảnh/Video)
-    2. Chọn nguồn (Tải lên/Mẫu)
-    3. Điều chỉnh ngưỡng tin cậy
-    4. Nhấn nút xử lý
     """)
-# Thiết lập requirements.txt
 def create_requirements():
     return """
-    streamlit
-    numpy
-    Pillow
-    opencv-python
-    torch
-    torchvision
-    transformers
-    ultralytics
-    requests
-    opencv-python
-    """
 if __name__ == "__main__":
     main()

 import cv2
 import numpy as np
 import torch
+from ultralytics import YOLO
+import time
 import tempfile
 import os
 import requests
 # Cấu hình trang
 st.set_page_config(page_title="Phát hiện người và độ sâu", layout="wide")
+# Giả lập module depth_pro
+class Depth_pro:
+    @staticmethod
+    def create_model_and_transforms():
+        # Import thư viện cần thiết
+        import torch
+        from transformers import AutoImageProcessor, AutoModelForDepthEstimation
+        # Tải processor và model
+        processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-nyu")
+        model = AutoModelForDepthEstimation.from_pretrained("vinvino02/glpn-nyu")
+        # Thiết lập model và đưa về chế độ đánh giá
+        model.eval()
+        # Hàm transform cho ảnh đầu vào
+        def transform(image):
+            return processor(images=image, return_tensors="pt").pixel_values
+        # Mở rộng model với phương thức infer (tương thích với mã ban đầu)
+        def infer_method(self, image, f_px=None):
+            with torch.no_grad():
+                outputs = self(image)
+                predicted_depth = outputs.predicted_depth
+            # Chuẩn hóa độ sâu
+            depth_min = torch.min(predicted_depth)
+            depth_max = torch.max(predicted_depth)
+            predicted_depth = (predicted_depth - depth_min) / (depth_max - depth_min)
+            predicted_depth = predicted_depth * 10  # Nhân với 10 để có giá trị mét hợp lý
+            return {"depth": predicted_depth}
+        # Gắn phương thức infer vào model
+        model.infer = infer_method.__get__(model)
+        return model, transform
+# Tải model YOLO và cache
 @st.cache_resource
 def load_yolo_model():
     model = YOLO("yolov8n.pt")
     return model
+# Tạo bản đồ màu từ ảnh độ sâu
+def create_depth_colormap(depth_map):
+    # Chuẩn hóa độ sâu
+    depth_np_normalized = (depth_map - np.min(depth_map)) / (np.max(depth_map) - np.min(depth_map))
+    inv_depth_np_normalized = 1 - depth_np_normalized  # Đảo ngược (gần = sáng, xa = tối)
+    # Chuyển đổi sang bản đồ màu
+    depth_colormap = cv2.applyColorMap((inv_depth_np_normalized * 255).astype(np.uint8), cv2.COLORMAP_TURBO)
+    # Chuyển đổi từ BGR sang RGB
+    depth_colormap_rgb = cv2.cvtColor(depth_colormap, cv2.COLOR_BGR2RGB)
+    return depth_colormap_rgb
+# Vẽ nhãn trên ảnh
+def draw_depth_label(image, text, position):
+    x1, y1 = position
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 0.7
+    font_thickness = 2
+    text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]
+    # Vẽ hình chữ nhật nền
+    text_x = x1
+    text_y = y1 - 10
+    rect_x1 = text_x - 5
+    rect_y1 = text_y - text_size[1] - 5
+    rect_x2 = text_x + text_size[0] + 5
+    rect_y2 = text_y + 5
+    cv2.rectangle(image, (rect_x1, rect_y1), (rect_x2, rect_y2), (0, 255, 0), -1)
+    # Vẽ văn bản
+    cv2.putText(image, text, (text_x, text_y), font, font_scale, (0, 0, 0), font_thickness)
+# Chức năng xử lý ảnh
+def process_image():
+    st.header("Phát hiện người và Ước tính độ sâu trong Ảnh")
+    # Tùy chọn cho ảnh
+    upload_option = st.radio("Chọn nguồn ảnh:", ["Tải lên ảnh", "Sử dụng ảnh mẫu"])
+    image = None
+    if upload_option == "Tải lên ảnh":
+        uploaded_image = st.file_uploader("Tải lên ảnh", type=["jpg", "jpeg", "png"])
+        if uploaded_image is not None:
+            image = Image.open(uploaded_image)
+            image_np = np.array(image)
+            # Hiển thị ảnh gốc
+            st.image(image_np, caption="Ảnh đã tải lên", use_column_width=True)
+    else:
+        # Sử dụng ảnh mẫu
+        sample_img_url = "https://storage.googleapis.com/sfr-vision-language-research/DINO/ground_truth_images/000000014439.jpg"
+        try:
+            response = requests.get(sample_img_url)
+            image = Image.open(BytesIO(response.content))
+            image_np = np.array(image)
+            # Hiển thị ảnh mẫu
+            st.image(image_np, caption="Ảnh mẫu", use_column_width=True)
+        except Exception as e:
+            st.error(f"Không thể tải ảnh mẫu: {e}")
+    # Ngưỡng tin cậy cho phát hiện
+    confidence = st.slider("Ngưỡng tin cậy:", 0.0, 1.0, 0.5, 0.05)
+    # Chỉ tiếp tục nếu có ảnh
+    if image is not None and st.button("Xử lý Ảnh"):
+        with st.spinner("Đang xử lý ảnh..."):
+            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+            st.info(f"Đang sử dụng thiết bị: {device}")
+            # Tải models
+            yolo_model = load_yolo_model()
+            depth_pro = Depth_pro()
+            depth_model, transform = depth_pro.create_model_and_transforms()
+            if device.type == 'cuda':
+                depth_model.to(device)
             # Phát hiện người
+            results = yolo_model(image_np, conf=confidence)
+            # Chuẩn bị ảnh để vẽ kết quả
+            output_image = image_np.copy()
+            # Chuẩn bị đầu vào cho model độ sâu
+            if len(image_np.shape) == 3 and image_np.shape[2] == 3:
+                if image_np.dtype == np.uint8:
+                    rgb_image = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB) if image_np.shape[2] == 3 else image_np
+                    pil_image = Image.fromarray(rgb_image)
+                else:
+                    pil_image = image
+            else:
+                pil_image = image
+            # Chuyển đổi ảnh cho model độ sâu
+            depth_input = transform(pil_image)
+            if device.type == 'cuda':
+                depth_input = depth_input.to(device)
+            # Ước tính độ sâu
+            focal_length_px = torch.tensor([max(image_np.shape[1], image_np.shape[0])], device=device)
+            with torch.no_grad():
+                predictions = depth_model.infer(depth_input, f_px=focal_length_px)
+                depth = predictions["depth"]
+            # Chuyển tensor sang numpy
+            depth_np = depth.squeeze().cpu().numpy()
+            # Điều chỉnh kích thước bản đồ độ sâu nếu cần
+            if depth_np.shape[:2] != image_np.shape[:2]:
+                depth_np = cv2.resize(depth_np, (image_np.shape[1], image_np.shape[0]), interpolation=cv2.INTER_LINEAR)
             # Tạo bản đồ màu độ sâu
+            depth_colormap = create_depth_colormap(depth_np)
+            # Đếm số người phát hiện được
+            person_count = 0
+            # Xử lý kết quả YOLO
+            for result in results:
+                boxes = result.boxes.xyxy.cpu().numpy()
+                classes = result.boxes.cls.cpu().numpy()
+                confs = result.boxes.conf.cpu().numpy()
+                for box, cls, conf in zip(boxes, classes, confs):
+                    if result.names[int(cls)] == "person" and conf > confidence:
+                        person_count += 1
+                        x1, y1, x2, y2 = map(int, box[:4])
+                        # Vẽ khung giới hạn
+                        cv2.rectangle(output_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
+                        # Tính độ sâu ở vị trí trung tâm
+                        center_x = (x1 + x2) // 2
+                        center_y = (y1 + y2) // 2
+                        # Đảm bảo tọa độ trong giới hạn
+                        center_x = min(center_x, depth_np.shape[1] - 1)
+                        center_y = min(center_y, depth_np.shape[0] - 1)
+                        depth_value = depth_np[center_y, center_x]
+                        # Vẽ thông tin độ sâu
+                        text = f"Độ sâu: {depth_value:.2f}m"
+                        draw_depth_label(output_image, text, (x1, y1))
+            # Hiển thị kết quả
+            st.success(f"Đã phát hiện {person_count} người trong ảnh")
+            col1, col2 = st.columns(2)
+            col1.image(output_image, caption="Phát hiện người với độ sâu", use_column_width=True)
+            col2.image(depth_colormap, caption="Bản đồ độ sâu", use_column_width=True)
+# Chức năng xử lý video
+def process_video():
+    st.header("Phát hiện người và Ước tính độ sâu trong Video")
+    # Tùy chọn cho video
+    upload_option = st.radio("Chọn nguồn video:", ["Tải lên video", "Sử dụng video mẫu"])
+    video_path = None
+    if upload_option == "Tải lên video":
+        uploaded_video = st.file_uploader("Tải lên video", type=["mp4", "avi", "mov"])
+        if uploaded_video is not None:
+            # Lưu video tải lên vào tệp tạm thời
+            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+            temp_file.write(uploaded_video.read())
+            video_path = temp_file.name
+            temp_file.close()
+            # Hiển thị video gốc
+            st.video(video_path)
+    else:
+        # Sử dụng video mẫu
+        sample_video_url = "https://huggingface.co/spaces/Nupoor/SampleVideoDataset/resolve/main/pexels-richard-de-souza-1635985.mp4"
+        try:
+            # Tải video mẫu
+            response = requests.get(sample_video_url)
+            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+            temp_file.write(response.content)
+            video_path = temp_file.name
+            temp_file.close()
+            # Hiển thị video mẫu
+            st.video(video_path)
+        except Exception as e:
+            st.error(f"Không thể tải video mẫu: {e}")
+    # Ngưỡng tin cậy cho phát hiện
+    confidence = st.slider("Ngưỡng tin cậy:", 0.0, 1.0, 0.5, 0.05)
+    # Mỗi bao nhiêu khung hình thì cập nhật độ sâu
+    depth_update_interval = st.slider("Cập nhật độ sâu mỗi (số khung hình):", 1, 10, 5)
+    # Chỉ tiếp tục nếu có video
+    if video_path is not None and st.button("Xử lý Video"):
+        # Hiển thị thanh tiến trình
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        st.info(f"Đang sử dụng thiết bị: {device}")
+        # Tải models
+        with st.spinner('Đang tải mô hình YOLO...'):
+            yolo_model = load_yolo_model()
+            if device.type == 'cuda':
+                yolo_model.to(device)
+        with st.spinner('Đang tải mô hình độ sâu...'):
+            depth_pro = Depth_pro()
+            depth_model, transform = depth_pro.create_model_and_transforms()
+            if device.type == 'cuda':
+                depth_model.to(device)
+        # Mở video
+        cap = cv2.VideoCapture(video_path)
+        # Lấy thông tin video
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        # Tạo tệp đầu ra
+        temp_output_dir = tempfile.mkdtemp()
+        output_video_path = os.path.join(temp_output_dir, "person_detection_with_depth.mp4")
+        output_depth_path = os.path.join(temp_output_dir, "depth_colormap.mp4")
+        # Thiết lập writers
+        fourcc = cv2.VideoWriter_fourcc(*'XVID')
+        out_detection = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
+        out_depth = cv2.VideoWriter(output_depth_path, fourcc, fps, (width, height))
+        # Ước tính chiều dài tiêu cự
+        focal_length_px = torch.tensor([max(width, height)], device=device)
+        # Cột hiển thị khung hình đang xử lý
+        preview_col1, preview_col2 = st.columns(2)
+        detection_placeholder = preview_col1.empty()
+        depth_placeholder = preview_col2.empty()
+        frame_counter = 0
+        start_time = time.time()
+        depth_np = None
+        try:
+            while cap.isOpened():
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                frame_counter += 1
+                # Cập nhật tiến trình
+                progress = int(frame_counter / total_frames * 100)
+                progress_bar.progress(progress)
+                if frame_counter % 10 == 0:
+                    elapsed_time = time.time() - start_time
+                    frames_left = total_frames - frame_counter
+                    est_time_left = (elapsed_time / frame_counter) * frames_left if frame_counter > 0 else 0
+                    status_text.text(f"Đang xử lý khung hình {frame_counter}/{total_frames} - Thời gian còn lại: {est_time_left:.2f}s")
+                # Phát hiện người với YOLO
+                results = yolo_model(frame, conf=confidence)
+                person_boxes = []
+                for result in results:
+                    boxes = result.boxes.xyxy.cpu().numpy()
+                    classes = result.boxes.cls.cpu().numpy()
+                    confs = result.boxes.conf.cpu().numpy()
+                    for box, cls, conf in zip(boxes, classes, confs):
+                        if result.names[int(cls)] == "person" and conf > confidence:
+                            x1, y1, x2, y2 = map(int, box[:4])
+                            person_boxes.append((x1, y1, x2, y2))
+                            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
+                # Cập nhật độ sâu theo khoảng đã thiết lập
+                if frame_counter % depth_update_interval == 0 or frame_counter == 1 or depth_np is None:
+                    # Chuyển đổi khung hình cho model độ sâu
+                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    pil_image = Image.fromarray(rgb_frame)
+                    depth_input = transform(pil_image)
+                    if device.type == 'cuda':
+                        depth_input = depth_input.to(device)
+                    # Ước tính độ sâu
+                    with torch.no_grad():
+                        predictions = depth_model.infer(depth_input, f_px=focal_length_px)
+                        depth = predictions["depth"]
+                    depth_np = depth.squeeze().cpu().numpy()
+                    # Điều chỉnh kích thước nếu cần
+                    if depth_np.shape[:2] != (height, width):
+                        depth_np = cv2.resize(depth_np, (width, height), interpolation=cv2.INTER_LINEAR)
+                # Tạo bản đồ màu độ sâu
+                depth_colormap = create_depth_colormap(depth_np)
+                # Thêm thông tin độ sâu cho người đã phát hiện
+                for x1, y1, x2, y2 in person_boxes:
+                    center_x = (x1 + x2) // 2
+                    center_y = (y1 + y2) // 2
+                    # Đảm bảo tọa độ trong giới hạn
+                    center_x = min(center_x, depth_np.shape[1] - 1)
+                    center_y = min(center_y, depth_np.shape[0] - 1)
+                    depth_value = depth_np[center_y, center_x]
+                    # Vẽ thông tin độ sâu
+                    text = f"Độ sâu: {depth_value:.2f}m"
+                    draw_depth_label(frame, text, (x1, y1))
+                # Hiển thị khung hình trong Streamlit (cập nhật theo khoảng)
+                if frame_counter % 5 == 0 or frame_counter == 1:
+                    detection_placeholder.image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), caption="Phát hiện người", use_column_width=True)
+                    depth_placeholder.image(depth_colormap, caption="Bản đồ độ sâu", use_column_width=True)
+                # Ghi khung hình vào videos
+                out_detection.write(frame)
+                out_depth.write(cv2.cvtColor(depth_colormap, cv2.COLOR_RGB2BGR))
+        finally:
+            # Giải phóng tài nguyên
+            cap.release()
+            out_detection.release()
+            out_depth.release()
+            total_time = time.time() - start_time
+            st.success(f"Xử lý hoàn tất! Tổng thời gian: {total_time:.2f}s")
+            st.success(f"FPS trung bình: {frame_counter / total_time:.2f}")
+            # Hiển thị videos đã xử lý
+            st.subheader("Videos kết quả")
+            col1, col2 = st.columns(2)
+            with col1:
+                st.video(output_video_path)
+                st.download_button(
+                    label="Tải xuống video phát hiện",
+                    data=open(output_video_path, 'rb').read(),
+                    file_name="person_detection_with_depth.mp4",
+                    mime="video/mp4"
+                )
+            with col2:
+                st.video(output_depth_path)
+                st.download_button(
+                    label="Tải xuống bản đồ độ sâu",
+                    data=open(output_depth_path, 'rb').read(),
+                    file_name="depth_colormap.mp4",
+                    mime="video/mp4"
+                )
+            # Xóa tệp tạm thời
+            try:
+                os.unlink(video_path)
+            except:
+                pass
+# Giao diện chính
+def main():
+    st.title("Ứng dụng Phát hiện Người và Ước tính Độ sâu")
+    # Chọn chế độ xử lý
+    app_mode = st.sidebar.selectbox("Chọn chế độ:", ["Xử lý Ảnh", "Xử lý Video"])
+    # Hiển thị thông tin
     st.sidebar.header("Thông tin")
+    st.sidebar.info("""
     **Mô hình sử dụng:**
     - Phát hiện người: YOLOv8n
+    - Độ sâu: depth_pro (GLPN-NYU)
+    **Màu sắc trong bản đồ độ sâu:**
+    - Màu đỏ/vàng: Gần
+    - Màu xanh: Xa
     """)
+    # Chạy chức năng tương ứng
+    if app_mode == "Xử lý Ảnh":
+        process_image()
+    else:
+        process_video()
+# Tạo tệp requirements.txt cho Hugging Face Space
 def create_requirements():
     return """
+streamlit
+numpy
+Pillow
+opencv-python-headless
+torch
+torchvision
+transformers
+ultralytics
+requests
+"""
 if __name__ == "__main__":
     main()