Spaces:

Saiky2k
/

DepthPro_CVProject

Running

App Files Community

Saiky2k committed on Mar 21

Commit

b92ea3d

verified ·

1 Parent(s): 606a3bb

Update app.py

Browse files

Files changed (1) hide show

app.py +221 -375

app.py CHANGED Viewed

@@ -11,459 +11,305 @@ import os
 import requests
 from io import BytesIO
-# Cấu hình trang
-st.set_page_config(page_title="Phát hiện người và độ sâu", layout="wide")
-# Giả lập module depth_pro
-class Depth_pro:
     @staticmethod
     def create_model_and_transforms():
-        # Import thư viện cần thiết
         import torch
         from transformers import AutoImageProcessor, AutoModelForDepthEstimation
-        # Tải processor và model
         processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-nyu")
         model = AutoModelForDepthEstimation.from_pretrained("vinvino02/glpn-nyu")
-        # Thiết lập model và đưa về chế độ đánh giá
-        model.eval()
-        # Hàm transform cho ảnh đầu vào
         def transform(image):
             return processor(images=image, return_tensors="pt").pixel_values
-        # Mở rộng model với phương thức infer (tương thích với mã ban đầu)
         def infer_method(self, image, f_px=None):
             with torch.no_grad():
                 outputs = self(image)
                 predicted_depth = outputs.predicted_depth
             # Chuẩn hóa độ sâu
             depth_min = torch.min(predicted_depth)
-            depth_max = torch.max(predicted_depth)
             predicted_depth = (predicted_depth - depth_min) / (depth_max - depth_min)
-            predicted_depth = predicted_depth * 10  # Nhân với 10 để có giá trị mét hợp lý
             return {"depth": predicted_depth}
-        # Gắn phương thức infer vào model
         model.infer = infer_method.__get__(model)
         return model, transform
-# Tải model YOLO và cache
 @st.cache_resource
 def load_yolo_model():
     model = YOLO("yolov8n.pt")
     return model
-# Tạo bản đồ màu từ ảnh độ sâu
-def create_depth_colormap(depth_map):
-    # Chuẩn hóa độ sâu
-    depth_np_normalized = (depth_map - np.min(depth_map)) / (np.max(depth_map) - np.min(depth_map))
-    inv_depth_np_normalized = 1 - depth_np_normalized  # Đảo ngược (gần = sáng, xa = tối)
-    # Chuyển đổi sang bản đồ màu
-    depth_colormap = cv2.applyColorMap((inv_depth_np_normalized * 255).astype(np.uint8), cv2.COLORMAP_TURBO)
-    # Chuyển đổi từ BGR sang RGB
-    depth_colormap_rgb = cv2.cvtColor(depth_colormap, cv2.COLOR_BGR2RGB)
-    return depth_colormap_rgb
-# Vẽ nhãn trên ảnh
-def draw_depth_label(image, text, position):
-    x1, y1 = position
-    font = cv2.FONT_HERSHEY_SIMPLEX
-    font_scale = 0.7
-    font_thickness = 2
-    text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]
-    # Vẽ hình chữ nhật nền
-    text_x = x1
-    text_y = y1 - 10
-    rect_x1 = text_x - 5
-    rect_y1 = text_y - text_size[1] - 5
-    rect_x2 = text_x + text_size[0] + 5
-    rect_y2 = text_y + 5
-    cv2.rectangle(image, (rect_x1, rect_y1), (rect_x2, rect_y2), (0, 255, 0), -1)
-    # Vẽ văn bản
-    cv2.putText(image, text, (text_x, text_y), font, font_scale, (0, 0, 0), font_thickness)
-# Chức năng xử lý ảnh
-def process_image():
-    st.header("Phát hiện người và Ước tính độ sâu trong Ảnh")
-    # Tùy chọn cho ảnh
-    upload_option = st.radio("Chọn nguồn ảnh:", ["Tải lên ảnh", "Sử dụng ảnh mẫu"])
-    image = None
-    if upload_option == "Tải lên ảnh":
-        uploaded_image = st.file_uploader("Tải lên ảnh", type=["jpg", "jpeg", "png"])
-        if uploaded_image is not None:
-            image = Image.open(uploaded_image)
-            image_np = np.array(image)
-            # Hiển thị ảnh gốc
-            st.image(image_np, caption="Ảnh đã tải lên", use_container_width=True)
-    else:
-        # Sử dụng ảnh mẫu
-        sample_img_url = "https://storage.googleapis.com/sfr-vision-language-research/DINO/ground_truth_images/000000014439.jpg"
-        try:
-            response = requests.get(sample_img_url)
-            image = Image.open(BytesIO(response.content))
-            image_np = np.array(image)
-            # Hiển thị ảnh mẫu
-            st.image(image_np, caption="Ảnh mẫu", use_container_width=True)
-        except Exception as e:
-            st.error(f"Không thể tải ảnh mẫu: {e}")
-    # Ngưỡng tin cậy cho phát hiện
-    confidence = st.slider("Ngưỡng tin cậy:", 0.0, 1.0, 0.5, 0.05)
-    # Chỉ tiếp tục nếu có ảnh
-    if image is not None and st.button("Xử lý Ảnh"):
-        with st.spinner("Đang xử lý ảnh..."):
-            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-            st.info(f"Đang sử dụng thiết bị: {device}")
-            # Tải models
-            yolo_model = load_yolo_model()
-            depth_pro = Depth_pro()
-            depth_model, transform = depth_pro.create_model_and_transforms()
-            if device.type == 'cuda':
-                depth_model.to(device)
-            # Phát hiện người
-            results = yolo_model(image_np, conf=confidence)
-            # Chuẩn bị ảnh để vẽ kết quả
-            output_image = image_np.copy()
-            # Chuẩn bị đầu vào cho model độ sâu
-            if len(image_np.shape) == 3 and image_np.shape[2] == 3:
-                if image_np.dtype == np.uint8:
-                    rgb_image = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB) if image_np.shape[2] == 3 else image_np
-                    pil_image = Image.fromarray(rgb_image)
-                else:
-                    pil_image = image
             else:
-                pil_image = image
-            # Chuyển đổi ảnh cho model độ sâu
             depth_input = transform(pil_image)
             if device.type == 'cuda':
                 depth_input = depth_input.to(device)
             # Ước tính độ sâu
-            focal_length_px = torch.tensor([max(image_np.shape[1], image_np.shape[0])], device=device)
             with torch.no_grad():
                 predictions = depth_model.infer(depth_input, f_px=focal_length_px)
-                depth = predictions["depth"]
-            # Chuyển tensor sang numpy
             depth_np = depth.squeeze().cpu().numpy()
-            # Điều chỉnh kích thước bản đồ độ sâu nếu cần
-            if depth_np.shape[:2] != image_np.shape[:2]:
-                depth_np = cv2.resize(depth_np, (image_np.shape[1], image_np.shape[0]), interpolation=cv2.INTER_LINEAR)
             # Tạo bản đồ màu độ sâu
-            depth_colormap = create_depth_colormap(depth_np)
-            # Đếm số người phát hiện được
-            person_count = 0
-            # Xử lý kết quả YOLO
-            for result in results:
-                boxes = result.boxes.xyxy.cpu().numpy()
-                classes = result.boxes.cls.cpu().numpy()
-                confs = result.boxes.conf.cpu().numpy()
-                for box, cls, conf in zip(boxes, classes, confs):
-                    if result.names[int(cls)] == "person" and conf > confidence:
-                        person_count += 1
-                        x1, y1, x2, y2 = map(int, box[:4])
-                        # Vẽ khung giới hạn
-                        cv2.rectangle(output_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
-                        # Tính độ sâu ở vị trí trung tâm
-                        center_x = (x1 + x2) // 2
-                        center_y = (y1 + y2) // 2
-                        # Đảm bảo tọa độ trong giới hạn
-                        center_x = min(center_x, depth_np.shape[1] - 1)
-                        center_y = min(center_y, depth_np.shape[0] - 1)
-                        depth_value = depth_np[center_y, center_x]
-                        # Vẽ thông tin độ sâu
-                        text = f"Độ sâu: {depth_value:.2f}m"
-                        draw_depth_label(output_image, text, (x1, y1))
-            # Hiển thị kết quả
-            st.success(f"Đã phát hiện {person_count} người trong ảnh")
-            col1, col2 = st.columns(2)
-            col1.image(output_image, caption="Phát hiện người với độ sâu", use_container_width=True)
-            col2.image(depth_colormap, caption="Bản đồ độ sâu", use_container_width=True)
-# Chức năng xử lý video
-def process_video():
-    st.header("Phát hiện người và Ước tính độ sâu trong Video")
-    # Tùy chọn cho video
-    upload_option = st.radio("Chọn nguồn video:", ["Tải lên video", "Sử dụng video mẫu"])
     video_path = None
-    if upload_option == "Tải lên video":
-        uploaded_video = st.file_uploader("Tải lên video", type=["mp4", "avi", "mov"])
-        if uploaded_video is not None:
-            # Lưu video tải lên vào tệp tạm thời
-            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
-            temp_file.write(uploaded_video.read())
-            video_path = temp_file.name
-            temp_file.close()
-            # Hiển thị video gốc
-            st.video(video_path)
-    else:
-        # Sử dụng video mẫu
-        sample_video_url = "https://huggingface.co/spaces/Nupoor/SampleVideoDataset/resolve/main/pexels-richard-de-souza-1635985.mp4"
-        try:
-            # Tải video mẫu
-            response = requests.get(sample_video_url)
             temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
-            temp_file.write(response.content)
             video_path = temp_file.name
             temp_file.close()
-            # Hiển thị video mẫu
             st.video(video_path)
-        except Exception as e:
-            st.error(f"Không thể tải video mẫu: {e}")
-    # Ngưỡng tin cậy cho phát hiện
-    confidence = st.slider("Ngưỡng tin cậy:", 0.0, 1.0, 0.5, 0.05)
-    # Mỗi bao nhiêu khung hình thì cập nhật độ sâu
-    depth_update_interval = st.slider("Cập nhật độ sâu mỗi (số khung hình):", 1, 10, 5)
-    # Chỉ tiếp tục nếu có video
-    if video_path is not None and st.button("Xử lý Video"):
-        # Hiển thị thanh tiến trình
-        progress_bar = st.progress(0)
-        status_text = st.empty()
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        st.info(f"Đang sử dụng thiết bị: {device}")
-        # Tải models
-        with st.spinner('Đang tải mô hình YOLO...'):
-            yolo_model = load_yolo_model()
-            if device.type == 'cuda':
-                yolo_model.to(device)
-        with st.spinner('Đang tải mô hình độ sâu...'):
-            depth_pro = Depth_pro()
-            depth_model, transform = depth_pro.create_model_and_transforms()
-            if device.type == 'cuda':
-                depth_model.to(device)
-        # Mở video
-        cap = cv2.VideoCapture(video_path)
-        # Lấy thông tin video
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        fps = cap.get(cv2.CAP_PROP_FPS)
-        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        # Tạo tệp đầu ra
-        temp_output_dir = tempfile.mkdtemp()
-        output_video_path = os.path.join(temp_output_dir, "person_detection_with_depth.mp4")
-        output_depth_path = os.path.join(temp_output_dir, "depth_colormap.mp4")
-        # Thiết lập writers
-        fourcc = cv2.VideoWriter_fourcc(*'XVID')
-        out_detection = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
-        out_depth = cv2.VideoWriter(output_depth_path, fourcc, fps, (width, height))
-        # Ước tính chiều dài tiêu cự
-        focal_length_px = torch.tensor([max(width, height)], device=device)
-        # Cột hiển thị khung hình đang xử lý
-        preview_col1, preview_col2 = st.columns(2)
-        detection_placeholder = preview_col1.empty()
-        depth_placeholder = preview_col2.empty()
-        frame_counter = 0
-        start_time = time.time()
-        depth_np = None
-        try:
-            while cap.isOpened():
-                ret, frame = cap.read()
-                if not ret:
-                    break
-                frame_counter += 1
-                # Cập nhật tiến trình
-                progress = int(frame_counter / total_frames * 100)
-                progress_bar.progress(progress)
-                if frame_counter % 10 == 0:
-                    elapsed_time = time.time() - start_time
-                    frames_left = total_frames - frame_counter
-                    est_time_left = (elapsed_time / frame_counter) * frames_left if frame_counter > 0 else 0
-                    status_text.text(f"Đang xử lý khung hình {frame_counter}/{total_frames} - Thời gian còn lại: {est_time_left:.2f}s")
-                # Phát hiện người với YOLO
-                results = yolo_model(frame, conf=confidence)
-                person_boxes = []
-                for result in results:
-                    boxes = result.boxes.xyxy.cpu().numpy()
-                    classes = result.boxes.cls.cpu().numpy()
-                    confs = result.boxes.conf.cpu().numpy()
-                    for box, cls, conf in zip(boxes, classes, confs):
-                        if result.names[int(cls)] == "person" and conf > confidence:
-                            x1, y1, x2, y2 = map(int, box[:4])
-                            person_boxes.append((x1, y1, x2, y2))
-                            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
-                # Cập nhật độ sâu theo khoảng đã thiết lập
-                if frame_counter % depth_update_interval == 0 or frame_counter == 1 or depth_np is None:
-                    # Chuyển đổi khung hình cho model độ sâu
-                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    pil_image = Image.fromarray(rgb_frame)
-                    depth_input = transform(pil_image)
-                    if device.type == 'cuda':
-                        depth_input = depth_input.to(device)
-                    # Ước tính độ sâu
-                    with torch.no_grad():
-                        predictions = depth_model.infer(depth_input, f_px=focal_length_px)
-                        depth = predictions["depth"]
-                    depth_np = depth.squeeze().cpu().numpy()
-                    # Điều chỉnh kích thước nếu cần
-                    if depth_np.shape[:2] != (height, width):
-                        depth_np = cv2.resize(depth_np, (width, height), interpolation=cv2.INTER_LINEAR)
-                # Tạo bản đồ màu độ sâu
-                depth_colormap = create_depth_colormap(depth_np)
-                # Thêm thông tin độ sâu cho người đã phát hiện
-                for x1, y1, x2, y2 in person_boxes:
-                    center_x = (x1 + x2) // 2
-                    center_y = (y1 + y2) // 2
-                    # Đảm bảo tọa độ trong giới hạn
-                    center_x = min(center_x, depth_np.shape[1] - 1)
-                    center_y = min(center_y, depth_np.shape[0] - 1)
-                    depth_value = depth_np[center_y, center_x]
-                    # Vẽ thông tin độ sâu
-                    text = f"Độ sâu: {depth_value:.2f}m"
-                    draw_depth_label(frame, text, (x1, y1))
-                # Hiển thị khung hình trong Streamlit (cập nhật theo khoảng)
-                if frame_counter % 5 == 0 or frame_counter == 1:
-                    detection_placeholder.image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), caption="Phát hiện người", use_container_width=True)
-                    depth_placeholder.image(depth_colormap, caption="Bản đồ độ sâu", use_container_width=True)
-                # Ghi khung hình vào videos
-                out_detection.write(frame)
-                out_depth.write(cv2.cvtColor(depth_colormap, cv2.COLOR_RGB2BGR))
-        finally:
-            # Giải phóng tài nguyên
-            cap.release()
-            out_detection.release()
-            out_depth.release()
-            total_time = time.time() - start_time
-            st.success(f"Xử lý hoàn tất! Tổng thời gian: {total_time:.2f}s")
-            st.success(f"FPS trung bình: {frame_counter / total_time:.2f}")
-            # Hiển thị videos đã xử lý
-            st.subheader("Videos kết quả")
-            col1, col2 = st.columns(2)
-            with col1:
-                st.video(output_video_path)
-                st.download_button(
-                    label="Tải xuống video phát hiện",
-                    data=open(output_video_path, 'rb').read(),
-                    file_name="person_detection_with_depth.mp4",
-                    mime="video/mp4"
-                )
-            with col2:
-                st.video(output_depth_path)
-                st.download_button(
-                    label="Tải xuống bản đồ độ sâu",
-                    data=open(output_depth_path, 'rb').read(),
-                    file_name="depth_colormap.mp4",
-                    mime="video/mp4"
-                )
-            # Xóa tệp tạm thời
-            try:
-                os.unlink(video_path)
-            except:
-                pass
-# Giao diện chính
-def main():
-    st.title("Ứng dụng Phát hiện Người và Ước tính Độ sâu")
-    # Chọn chế độ xử lý
-    app_mode = st.sidebar.selectbox("Chọn chế độ:", ["Xử lý Ảnh", "Xử lý Video"])
-    # Hiển thị thông tin
-    st.sidebar.header("Thông tin")
-    st.sidebar.info("""
-    **Mô hình sử dụng:**
-    - Phát hiện người: YOLOv8n
-    - Độ sâu: depth_pro (GLPN-NYU)
-    **Màu sắc trong bản đồ độ sâu:**
-    - Màu đỏ/vàng: Gần
-    - Màu xanh: Xa
-    """)
-    # Chạy chức năng tương ứng
-    if app_mode == "Xử lý Ảnh":
-        process_image()
-    else:
-        process_video()
-# Tạo tệp requirements.txt cho Hugging Face Space
 def create_requirements():
-    return """
-streamlit
-numpy
-Pillow
-opencv-python-headless
-torch
-torchvision
-transformers
-ultralytics
-requests
-"""
 if __name__ == "__main__":
     main()

 import requests
 from io import BytesIO
+# Tạo module depth_pro đơn giản (để thay thế module gốc)
+class DepthPro:
     @staticmethod
     def create_model_and_transforms():
+        # Nhập các thư viện cần thiết ở đây để tránh lỗi khi khởi tạo
         import torch
         from transformers import AutoImageProcessor, AutoModelForDepthEstimation
+        # Tải mô hình depth estimation từ Hugging Face
         processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-nyu")
         model = AutoModelForDepthEstimation.from_pretrained("vinvino02/glpn-nyu")
+        # Tạo hàm transform đơn giản
         def transform(image):
             return processor(images=image, return_tensors="pt").pixel_values
+        # Mở rộng model với phương thức infer
         def infer_method(self, image, f_px=None):
             with torch.no_grad():
                 outputs = self(image)
                 predicted_depth = outputs.predicted_depth
             # Chuẩn hóa độ sâu
             depth_min = torch.min(predicted_depth)
+            depth_max = torch.max(predicted_depth)
             predicted_depth = (predicted_depth - depth_min) / (depth_max - depth_min)
+            predicted_depth = predicted_depth * 10  # Nhân với 10 để có giá trị mét hợp lý hơn
             return {"depth": predicted_depth}
+        # Thêm phương thức infer vào model
         model.infer = infer_method.__get__(model)
         return model, transform
+# Hàm tải mô hình YOLO từ Hugging Face
 @st.cache_resource
 def load_yolo_model():
+    # Sử dụng mô hình YOLOv8n từ Hugging Face
     model = YOLO("yolov8n.pt")
     return model
+# Hàm tải và chuẩn bị mô hình độ sâu
+@st.cache_resource
+def load_depth_model():
+    depth_pro = DepthPro()
+    model, transform = depth_pro.create_model_and_transforms()
+    return model, transform
+# Hàm xử lý video
+def process_video(video_path):
+    # Kiểm tra CUDA
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    st.info(f"Đang sử dụng thiết bị: {device}")
+    # Tải mô hình YOLO
+    with st.spinner('Đang tải mô hình YOLO...'):
+        yolo_model = load_yolo_model()
+        if device.type == 'cuda':
+            yolo_model.to(device)
+    # Tải mô hình độ sâu
+    with st.spinner('Đang tải mô hình độ sâu...'):
+        depth_model, transform = load_depth_model()
+        depth_model.eval()
+        if device.type == 'cuda':
+            depth_model.to(device)
+    # Mở video để xử lý
+    cap = cv2.VideoCapture(video_path)
+    # Lấy thuộc tính video cho đầu ra
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    # Tạo tệp tạm thời cho video đầu ra
+    temp_output_dir = tempfile.mkdtemp()
+    output_video_path = os.path.join(temp_output_dir, "person_detection_with_depth.mp4")
+    output_depth_path = os.path.join(temp_output_dir, "depth_colormap.mp4")
+    # Sử dụng codec phù hợp với môi trường Hugging Face
+    fourcc = cv2.VideoWriter_fourcc(*'XVID')  # Thay đổi từ mp4v sang XVID cho tương thích tốt hơn
+    out_detection = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
+    out_depth = cv2.VideoWriter(output_depth_path, fourcc, fps, (width, height))
+    # Ước tính chiều dài tiêu cự và chuyển đổi sang tensor
+    focal_length_px = torch.tensor([max(width, height)], device=device)
+    # Hiển thị thanh tiến trình
+    progress_bar = st.progress(0)
+    progress_text = st.empty()
+    frame_counter = 0
+    start_time = time.time()
+    # Tạo cột để hiển thị khung video
+    col1, col2 = st.columns(2)
+    detection_placeholder = col1.empty()
+    depth_placeholder = col2.empty()
+    # Giảm kích thước frame để tăng tốc độ xử lý
+    target_width = 640  # Kích thước đích
+    scale_factor = target_width / width
+    target_height = int(height * scale_factor)
+    try:
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frame_counter += 1
+            # Cập nhật tiến trình
+            progress = int(frame_counter / total_frames * 100)
+            progress_bar.progress(progress)
+            if frame_counter % 10 == 0:  # Hiển thị tiến trình mỗi 10 khung hình
+                elapsed_time = time.time() - start_time
+                frames_left = total_frames - frame_counter
+                est_time_left = (elapsed_time / frame_counter) * frames_left if frame_counter > 0 else 0
+                progress_text.text(f"Đang xử lý khung hình {frame_counter}/{total_frames} - Thời gian còn lại: {est_time_left:.2f}s")
+            # Giảm kích thước khung hình để tăng tốc xử lý
+            if scale_factor < 1:
+                frame_resized = cv2.resize(frame, (target_width, target_height))
             else:
+                frame_resized = frame
+            # Phát hiện YOLO
+            results = yolo_model(frame_resized)
+            person_boxes = []
+            for result in results:
+                boxes = result.boxes.xyxy.cpu().numpy()
+                classes = result.boxes.cls.cpu().numpy()
+                confs = result.boxes.conf.cpu().numpy()
+                for box, cls, conf in zip(boxes, classes, confs):
+                    if result.names[int(cls)] == "person" and conf > 0.5:  # Thêm ngưỡng tin cậy
+                        if scale_factor < 1:  # Điều chỉnh lại khung giới hạn nếu đã thay đổi kích thước
+                            x1, y1, x2, y2 = map(int, [box[0]/scale_factor, box[1]/scale_factor,
+                                                      box[2]/scale_factor, box[3]/scale_factor])
+                        else:
+                            x1, y1, x2, y2 = map(int, box[:4])
+                        person_boxes.append((x1, y1, x2, y2))
+                        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
+            # Chuyển đổi khung hình cho đầu vào mô hình độ sâu
+            rgb_frame = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(rgb_frame)
             depth_input = transform(pil_image)
             if device.type == 'cuda':
                 depth_input = depth_input.to(device)
             # Ước tính độ sâu
             with torch.no_grad():
                 predictions = depth_model.infer(depth_input, f_px=focal_length_px)
+                depth = predictions["depth"]  # Độ sâu theo [m]
             depth_np = depth.squeeze().cpu().numpy()
+            # Điều chỉnh lại kích thước bản đồ độ sâu
+            if scale_factor < 1:
+                depth_np = cv2.resize(depth_np, (width, height), interpolation=cv2.INTER_LINEAR)
             # Tạo bản đồ màu độ sâu
+            depth_np_normalized = (depth_np - depth_np.min()) / (depth_np.max() - depth_np.min())
+            inv_depth_np_normalized = 1 - depth_np_normalized
+            depth_colormap = cv2.applyColorMap((inv_depth_np_normalized * 255).astype(np.uint8), cv2.COLORMAP_TURBO)
+            # Thêm giá trị độ sâu cho người được phát hiện
+            for x1, y1, x2, y2 in person_boxes:
+                center_x = (x1 + x2) // 2
+                center_y = (y1 + y2) // 2
+                # Đảm bảo tọa độ nằm trong giới hạn
+                center_x = min(center_x, depth_np.shape[1] - 1)
+                center_y = min(center_y, depth_np.shape[0] - 1)
+                depth_value = depth_np[center_y, center_x]
+                text = f"Độ sâu: {depth_value:.2f} m"
+                font = cv2.FONT_HERSHEY_SIMPLEX
+                font_scale = 0.8  # Giảm kích thước font để phù hợp
+                font_thickness = 2
+                text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]
+                text_x = x1
+                text_y = y1 - 10
+                rect_x1 = text_x - 5
+                rect_y1 = text_y - text_size[1] - 10
+                rect_x2 = text_x + text_size[0] + 5
+                rect_y2 = text_y + 5
+                cv2.rectangle(frame, (rect_x1, rect_y1), (rect_x2, rect_y2), (0, 255, 0), -1)
+                cv2.putText(frame, text, (text_x, text_y), font, font_scale, (0, 0, 0), font_thickness)
+            # Hiển thị khung hình trong Streamlit (cập nhật mỗi 5 khung hình để tránh làm chậm)
+            if frame_counter % 5 == 0:
+                detection_placeholder.image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), caption="Phát hiện người", use_column_width=True)
+                depth_placeholder.image(depth_colormap, caption="Bản đồ độ sâu", use_column_width=True)
+            # Ghi khung hình vào video đầu ra
+            out_detection.write(frame)
+            out_depth.write(depth_colormap)
+    finally:
+        # Giải phóng tài nguyên
+        cap.release()
+        out_detection.release()
+        out_depth.release()
+        total_time = time.time() - start_time
+        st.success(f"Xử lý hoàn tất! Tổng thời gian: {total_time:.2f}s")
+        st.success(f"FPS trung bình: {frame_counter / total_time:.2f}")
+        return output_video_path, output_depth_path
+# Giao diện Streamlit chính
+def main():
+    st.title("Ứng dụng Phát hiện Người và Ước tính Độ sâu")
+    st.write("Tải lên video để phát hiện người và hiển thị thông tin độ sâu")
+    # Tùy chọn video mẫu
+    st.sidebar.header("Tùy chọn")
+    use_sample = st.sidebar.checkbox("Sử dụng video mẫu")
     video_path = None
+        # Tải lên tệp video
+    uploaded_file = st.file_uploader("Chọn một tệp video", type=['mp4', 'avi', 'mov'])
+    if uploaded_file is not None:
+            # Lưu tệp đã tải lên vào thư mục tạm thời
             temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+            temp_file.write(uploaded_file.read())
             video_path = temp_file.name
             temp_file.close()
             st.video(video_path)
+    # Hiển thị thông tin về mô hình
+    st.sidebar.header("Thông tin mô hình")
+    st.sidebar.markdown("""
+    - Phát hiện người: YOLOv8n
+    - Ước tính độ sâu: GLPN-NYU từ HuggingFace
+    """)
+    # Thêm tùy chọn cho độ tin cậy phát hiện
+    confidence = st.sidebar.slider("Ngưỡng tin cậy", 0.0, 1.0, 0.5)
+    # Nút để bắt đầu xử lý
+    if video_path and st.button("Xử lý Video"):
+        with st.spinner("Đang xử lý video..."):
+            detection_video_path, depth_video_path = process_video(video_path)
+        # Hiển thị video đã xử lý
+        st.subheader("Video đã xử lý")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.video(detection_video_path)
+            st.download_button(
+                label="Tải xuống video phát hiện",
+                data=open(detection_video_path, 'rb').read(),
+                file_name="person_detection_with_depth.mp4",
+                mime="video/mp4"
+            )
+        with col2:
+            st.video(depth_video_path)
+            st.download_button(
+                label="Tải xuống bản đồ độ sâu",
+                data=open(depth_video_path, 'rb').read(),
+                file_name="depth_colormap.mp4",
+                mime="video/mp4"
+            )
+        # Xóa tệp tạm thời
+        os.unlink(video_path)
+# Tệp requirements.txt
 def create_requirements():
+    requirements = """
+    streamlit
+    numpy
+    Pillow
+    opencv-python
+    torch
+    torchvision
+    transformers
+    ultralytics
+    requests
+    opencv-python
+    """
+    return requirements
 if __name__ == "__main__":
     main()