Spaces:

vijulshah
/

pupilsense

Running

App Files Files Community

vijul.shah committed on Sep 18, 2024

Commit

9acc552

1 Parent(s): f0adec0

Input Video and Predictions as output video added

Browse files

Files changed (8) hide show

app.py +63 -381
app_old.py +434 -0
app_utils.py +374 -0
config.yml +2 -1
feature_extraction/extractor_mediapipe.py +12 -38
feature_extraction/features_extractor.py +1 -3
image.py +32 -0
video.py +48 -0

app.py CHANGED Viewed

@@ -1,431 +1,113 @@
-# taken from: https://huggingface.co/spaces/frgfm/torch-cam/blob/main/app.py
-# streamlit run app.py
-from io import BytesIO
 import os
 import sys
-import cv2
-import matplotlib.pyplot as plt
-import numpy as np
-import streamlit as st
-import torch
 import tempfile
-from PIL import Image
-from torchvision import models
-from torchvision.transforms.functional import normalize, resize, to_pil_image, to_tensor
-from torchvision import transforms
-from torchcam.methods import CAM
-from torchcam import methods as torchcam_methods
-from torchcam.utils import overlay_mask
 import os.path as osp
 root_path = osp.abspath(osp.join(__file__, osp.pardir))
 sys.path.append(root_path)
-from preprocessing.dataset_creation import EyeDentityDatasetCreation
-from utils import get_model
 from registry_utils import import_registered_modules
 import_registered_modules()
-# from torchcam.methods._utils import locate_candidate_layer
-CAM_METHODS = [
-    "CAM",
-    # "GradCAM",
-    # "GradCAMpp",
-    # "SmoothGradCAMpp",
-    # "ScoreCAM",
-    # "SSCAM",
-    # "ISCAM",
-    # "XGradCAM",
-    # "LayerCAM",
-]
-TV_MODELS = [
-    "ResNet18",
-    "ResNet50",
-]
 SR_METHODS = ["GFPGAN", "CodeFormer", "RealESRGAN", "SRResNet", "HAT"]
 UPSCALE = [2, 4]
 UPSCALE_METHODS = ["BILINEAR", "BICUBIC"]
 LABEL_MAP = ["left_pupil", "right_pupil"]
-@torch.no_grad()
-def _load_model(model_configs, device="cpu"):
-    model_path = os.path.join(root_path, model_configs["model_path"])
-    model_configs.pop("model_path")
-    model_dict = torch.load(model_path, map_location=device)
-    model = get_model(model_configs=model_configs)
-    model.load_state_dict(model_dict)
-    model = model.to(device)
-    model = model.eval()
-    return model
-def extract_frames(video_path):
-    vidcap = cv2.VideoCapture(video_path)
-    frames = []
-    success, image = vidcap.read()
-    count = 0
-    while success:
-        # Convert the frame to RGB (cv2 uses BGR by default)
-        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-        frames.append(image_rgb)
-        success, image = vidcap.read()
-        count += 1
-    vidcap.release()
-    return frames
-# Function to check if a file is an image
-def is_image(file_extension):
-    return file_extension.lower() in ["png", "jpeg", "jpg"]
-# Function to check if a file is a video
-def is_video(file_extension):
-    return file_extension.lower() in ["mp4", "avi", "mov", "mkv", "webm"]
-def resize_frame(frame, max_width, max_height):
-    image = Image.fromarray(frame)
-    original_size = image.size
-    # Resize the frame similarly to the image resizing logic
-    if original_size[0] == original_size[1] and original_size[0] >= 256:
-        max_size = (256, 256)
-    else:
-        max_size = list(original_size)
-        if original_size[0] >= 640:
-            max_size[0] = 640
-        elif original_size[0] < 64:
-            max_size[0] = 64
-        if original_size[1] >= 480:
-            max_size[1] = 480
-        elif original_size[1] < 32:
-            max_size[1] = 32
-    image.thumbnail(max_size)
-    return image
 def main():
-    # Wide mode
     st.set_page_config(page_title="Pupil Diameter Estimator", layout="wide")
-    # Designing the interface
     st.title("EyeDentify Playground")
-    # For newline
-    st.write("\n")
-    # Set the columns
     cols = st.columns((1, 1))
-    # cols = st.columns((1, 1, 1))
-    cols[0].header("Input image")
-    # cols[1].header("Raw CAM")
     cols[-1].header("Prediction")
-    # Sidebar
-    # File selection
     st.sidebar.title("Upload Face or Eye")
-    # Disabling warning
-    st.set_option("deprecation.showfileUploaderEncoding", False)
-    # Choose your own image
     uploaded_file = st.sidebar.file_uploader(
         "Upload Image or Video", type=["png", "jpeg", "jpg", "mp4", "avi", "mov", "mkv", "webm"]
     )
     if uploaded_file is not None:
-        # Get file extension
         file_extension = uploaded_file.name.split(".")[-1]
-        input_imgs = []
         if is_image(file_extension):
-            input_img = Image.open(BytesIO(uploaded_file.read()), mode="r").convert("RGB")
-            # print("input_img before = ", input_img.size)
-            max_size = [input_img.size[0], input_img.size[1]]
-            cols[0].text(f"Input Image: {max_size[0]} x {max_size[1]}")
-            if input_img.size[0] == input_img.size[1] and input_img.size[0] >= 256:
-                max_size[0] = 256
-                max_size[1] = 256
-            else:
-                if input_img.size[0] >= 640:
-                    max_size[0] = 640
-                elif input_img.size[0] < 64:
-                    max_size[0] = 64
-                if input_img.size[1] >= 480:
-                    max_size[1] = 480
-                elif input_img.size[1] < 32:
-                    max_size[1] = 32
-            input_img.thumbnail((max_size[0], max_size[1]))  # Bicubic resampling
-            input_imgs.append(input_img)
-            # print("input_img after = ", input_img.size)
-            # cols[0].image(input_img)
-            fig0, axs0 = plt.subplots(1, 1, figsize=(10, 10))
-            # Display the input image
-            axs0.imshow(input_imgs[0])
-            axs0.axis("off")
-            axs0.set_title("Input Image")
-            # Display the plot
-            cols[0].pyplot(fig0)
-            cols[0].text(f"Input Image Resized: {max_size[0]} x {max_size[1]}")
-            # TODO: show the face features extracted from the image under 'input image' column
         elif is_video(file_extension):
             tfile = tempfile.NamedTemporaryFile(delete=False)
             tfile.write(uploaded_file.read())
             video_path = tfile.name
-            # Extract frames from the video
-            frames = extract_frames(video_path)
-            print(f"Extracted {len(frames)} frames from the video")
-            # Process the frames
-            for i, frame in enumerate(frames):
-                input_imgs.append(resize_frame(frame, 640, 480))
-            os.remove(video_path)
-            fig0, axs0 = plt.subplots(1, 1, figsize=(10, 10))
-            # Display the input image
-            axs0.imshow(input_imgs[0])
-            axs0.axis("off")
-            axs0.set_title("Input Image")
-            # Display the plot
-            cols[0].pyplot(fig0)
-            # cols[0].text(f"Input Image Resized: {max_size[0]} x {max_size[1]}")
     st.sidebar.title("Setup")
-    # Upscale selection
-    upscale = "-"
-    # upscale = st.sidebar.selectbox(
-    #     "Upscale",
-    #     ["-"] + UPSCALE,
-    #     help="Upscale the uploaded image 2 or 4 times. Keep blank for no upscaling",
-    # )
-    # Upscale method selection
-    if upscale != "-":
-        upscale_method_or_model = st.sidebar.selectbox(
-            "Upscale Method / Model",
-            UPSCALE_METHODS + SR_METHODS,
-            help="Select a method or model to upscale the uploaded image",
-        )
-    else:
-        upscale_method_or_model = None
-    # Pupil selection
     pupil_selection = st.sidebar.selectbox(
-        "Pupil Selection",
-        ["-"] + LABEL_MAP,
-        help="Select left or right pupil OR keep blank for both pupil diameter estimation",
-    )
-    # Model selection
-    tv_model = st.sidebar.selectbox(
-        "Classification model",
-        TV_MODELS,
-        help="Supported Models for Pupil Diameter Estimation",
     )
-    cam_method = "CAM"
-    # cam_method = st.sidebar.selectbox(
-    #     "CAM method",
-    #     CAM_METHODS,
-    #     help="The way your class activation map will be computed",
-    # )
-    # target_layer = st.sidebar.text_input(
-    #     "Target layer",
-    #     default_layer,
-    #     help='If you want to target several layers, add a "+" separator (e.g. "layer3+layer4")',
-    # )
-    st.sidebar.write("\n")
     if st.sidebar.button("Predict Diameter & Compute CAM"):
         if uploaded_file is None:
-            st.sidebar.error("Please upload an image first")
         else:
             with st.spinner("Analyzing..."):
-                model = None
-                for input_img in input_imgs:
-                    if upscale == "-":
-                        sr_configs = None
-                    else:
-                        sr_configs = {
-                            "method": upscale_method_or_model,
-                            "params": {"upscale": upscale},
-                        }
-                    config_file = {
-                        "sr_configs": sr_configs,
-                        "feature_extraction_configs": {
-                            "blink_detection": False,
-                            "upscale": upscale,
-                            "extraction_library": "mediapipe",
-                        },
-                    }
-                    img = np.array(input_img)
-                    # img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-                    # if img.shape[0] > max_size or img.shape[1] > max_size:
-                    #     img = cv2.resize(img, (max_size, max_size))
-                    ds_results = EyeDentityDatasetCreation(
-                        feature_extraction_configs=config_file["feature_extraction_configs"],
-                        sr_configs=config_file["sr_configs"],
-                    )(img)
-                    # if ds_results is not None:
-                    # print("ds_results = ", ds_results.keys())
-                    preprocess_steps = [
-                        transforms.ToTensor(),
-                        transforms.Resize(
-                            [32, 64],
-                            # interpolation=transforms.InterpolationMode.BILINEAR,
-                            interpolation=transforms.InterpolationMode.BICUBIC,
-                            antialias=True,
-                        ),
-                    ]
-                    preprocess_function = transforms.Compose(preprocess_steps)
-                    left_eye = None
-                    right_eye = None
-                    if ds_results is None:
-                        # print("type of input_img = ", type(input_img))
-                        input_img = preprocess_function(input_img)
-                        input_img = input_img.unsqueeze(0)
-                        if pupil_selection == "left_pupil":
-                            left_eye = input_img
-                        elif pupil_selection == "right_pupil":
-                            right_eye = input_img
-                        else:
-                            left_eye = input_img
-                            right_eye = input_img
-                        # print("type of left_eye = ", type(left_eye))
-                        # print("type of right_eye = ", type(right_eye))
-                    elif "eyes" in ds_results.keys():
-                        if "left_eye" in ds_results["eyes"].keys() and ds_results["eyes"]["left_eye"] is not None:
-                            left_eye = ds_results["eyes"]["left_eye"]
-                            # print("type of left_eye = ", type(left_eye))
-                            left_eye = to_pil_image(left_eye).convert("RGB")
-                            # print("type of left_eye = ", type(left_eye))
-                            left_eye = preprocess_function(left_eye)
-                            # print("type of left_eye = ", type(left_eye))
-                            left_eye = left_eye.unsqueeze(0)
-                        if "right_eye" in ds_results["eyes"].keys() and ds_results["eyes"]["right_eye"] is not None:
-                            right_eye = ds_results["eyes"]["right_eye"]
-                            # print("type of right_eye = ", type(right_eye))
-                            right_eye = to_pil_image(right_eye).convert("RGB")
-                            # print("type of right_eye = ", type(right_eye))
-                            right_eye = preprocess_function(right_eye)
-                            # print("type of right_eye = ", type(right_eye))
-                            right_eye = right_eye.unsqueeze(0)
-                    else:
-                        # print("type of input_img = ", type(input_img))
-                        input_img = preprocess_function(input_img)
-                        input_img = input_img.unsqueeze(0)
-                        if pupil_selection == "left_pupil":
-                            left_eye = input_img
-                        elif pupil_selection == "right_pupil":
-                            right_eye = input_img
-                        else:
-                            left_eye = input_img
-                            right_eye = input_img
-                        # print("type of left_eye = ", type(left_eye))
-                        # print("type of right_eye = ", type(right_eye))
-                    # print("left_eye = ", left_eye.shape)
-                    # print("right_eye = ", right_eye.shape)
-                    if pupil_selection == "-":
-                        selected_eyes = ["left_eye", "right_eye"]
-                    elif pupil_selection == "left_pupil":
-                        selected_eyes = ["left_eye"]
-                    elif pupil_selection == "right_pupil":
-                        selected_eyes = ["right_eye"]
-                    for eye_type in selected_eyes:
-                        if model is None:
-                            model_configs = {
-                                "model_path": root_path + f"/pre_trained_models/{tv_model}/{eye_type}.pt",
-                                "registered_model_name": tv_model,
-                                "num_classes": 1,
-                            }
-                            registered_model_name = model_configs["registered_model_name"]
-                            model = _load_model(model_configs)
-                        if registered_model_name == "ResNet18":
-                            target_layer = model.resnet.layer4[-1].conv2
-                        elif registered_model_name == "ResNet50":
-                            target_layer = model.resnet.layer4[-1].conv3
-                        else:
-                            raise Exception(f"No target layer available for selected model: {registered_model_name}")
-                        if left_eye is not None and eye_type == "left_eye":
-                            input_img = left_eye
-                        elif right_eye is not None and eye_type == "right_eye":
-                            input_img = right_eye
-                        else:
-                            raise Exception("Wrong Data")
-                        if cam_method is not None:
-                            cam_extractor = torchcam_methods.__dict__[cam_method](
-                                model,
-                                target_layer=target_layer,
-                                fc_layer=model.resnet.fc,
-                                input_shape=input_img.shape,
-                            )
-                        # with torch.no_grad():
-                        out = model(input_img)
-                        cols[-1].markdown(
-                            f"<h3>Predicted Pupil Diameter: {out[0].item():.2f} mm</h3>",
-                            unsafe_allow_html=True,
-                        )
-                        # cols[-1].text(f"Predicted Pupil Diameter: {out[0].item():.2f}")
-                        # Retrieve the CAM
-                        act_maps = cam_extractor(0, out)
-                        # Fuse the CAMs if there are several
-                        activation_map = act_maps[0] if len(act_maps) == 1 else cam_extractor.fuse_cams(act_maps)
-                        # Convert input image and activation map to PIL images
-                        input_image_pil = to_pil_image(input_img.squeeze(0))
-                        activation_map_pil = to_pil_image(activation_map, mode="F")
-                        # Create the overlayed CAM result
-                        result = overlay_mask(
-                            input_image_pil,
-                            activation_map_pil,
-                            alpha=0.5,
-                        )
-                        # Create a subplot with 1 row and 2 columns
-                        fig, axs = plt.subplots(1, 2, figsize=(10, 5))
-                        # Display the input image
-                        axs[0].imshow(input_image_pil)
-                        axs[0].axis("off")
-                        axs[0].set_title("Input Image")
-                        # Display the overlayed CAM result
-                        axs[1].imshow(result)
-                        axs[1].axis("off")
-                        axs[1].set_title("Overlayed CAM")
-                        # Display the plot
-                        cols[-1].pyplot(fig)
-                        cols[-1].text(f"eye image size: {input_img.shape[-1]} x {input_img.shape[-2]}")
 if __name__ == "__main__":

 import os
 import sys
 import tempfile
 import os.path as osp
+from PIL import Image
+from io import BytesIO
+import numpy as np
+import streamlit as st
+from PIL import ImageOps
+from matplotlib import pyplot as plt
 root_path = osp.abspath(osp.join(__file__, osp.pardir))
 sys.path.append(root_path)
 from registry_utils import import_registered_modules
+from app_utils import (
+    extract_frames,
+    is_image,
+    is_video,
+    display_results,
+    overlay_text_on_frame,
+    process_frames,
+    process_video,
+    resize_frame,
+)
 import_registered_modules()
+CAM_METHODS = ["CAM"]
+TV_MODELS = ["ResNet18", "ResNet50"]
 SR_METHODS = ["GFPGAN", "CodeFormer", "RealESRGAN", "SRResNet", "HAT"]
 UPSCALE = [2, 4]
 UPSCALE_METHODS = ["BILINEAR", "BICUBIC"]
 LABEL_MAP = ["left_pupil", "right_pupil"]
 def main():
     st.set_page_config(page_title="Pupil Diameter Estimator", layout="wide")
     st.title("EyeDentify Playground")
     cols = st.columns((1, 1))
+    cols[0].header("Input")
     cols[-1].header("Prediction")
     st.sidebar.title("Upload Face or Eye")
     uploaded_file = st.sidebar.file_uploader(
         "Upload Image or Video", type=["png", "jpeg", "jpg", "mp4", "avi", "mov", "mkv", "webm"]
     )
     if uploaded_file is not None:
         file_extension = uploaded_file.name.split(".")[-1]
         if is_image(file_extension):
+            input_img = Image.open(BytesIO(uploaded_file.read())).convert("RGB")
+            # NOTE: images taken with a phone camera carry an EXIF orientation field, so photos shot with the phone tilted can display rotated. PIL's ImageOps.exif_transpose applies that orientation and strips the tag, which 'uprights' the image.
+            input_img = ImageOps.exif_transpose(input_img)
+            input_img = resize_frame(input_img, max_width=640, max_height=480)
+            input_img = resize_frame(input_img, max_width=640, max_height=480)
+            cols[0].image(input_img, use_column_width=True)
+            input_img.save("out.jpg")
         elif is_video(file_extension):
             tfile = tempfile.NamedTemporaryFile(delete=False)
             tfile.write(uploaded_file.read())
             video_path = tfile.name
+            video_frames = extract_frames(video_path)
+            cols[0].video(video_path)
     st.sidebar.title("Setup")
     pupil_selection = st.sidebar.selectbox(
+        "Pupil Selection", ["both"] + LABEL_MAP, help="Select left or right pupil OR both for diameter estimation"
     )
+    tv_model = st.sidebar.selectbox("Classification model", ["ResNet18", "ResNet50"], help="Supported Models")
     if st.sidebar.button("Predict Diameter & Compute CAM"):
         if uploaded_file is None:
+            st.sidebar.error("Please upload an image or video")
         else:
             with st.spinner("Analyzing..."):
+                if is_image(file_extension):
+                    input_frames, output_frames, predicted_diameters, face_frames = process_frames(
+                        [input_img], tv_model, pupil_selection, cam_method=CAM_METHODS[-1]
+                    )
+                    for ff in face_frames:
+                        if ff["has_face"]:
+                            cols[1].image(face_frames[0]["img"], use_column_width=True)
+                    input_frames_keys = input_frames.keys()
+                    video_cols = cols[1].columns(len(input_frames_keys))
+                    for i, eye_type in enumerate(input_frames_keys):
+                        video_cols[i].image(input_frames[eye_type][-1], use_column_width=True)
+                    output_frames_keys = output_frames.keys()
+                    fig, axs = plt.subplots(1, len(output_frames_keys), figsize=(10, 5))
+                    for i, eye_type in enumerate(output_frames_keys):
+                        height, width, c = output_frames[eye_type][0].shape
+                        video_cols[i].image(output_frames[eye_type][-1], use_column_width=True)
+                        frame = np.zeros((height, width, c), dtype=np.uint8)
+                        text = f"{predicted_diameters[eye_type][0]:.2f}"
+                        frame = overlay_text_on_frame(frame, text)
+                        video_cols[i].image(frame, use_column_width=True)
+                elif is_video(file_extension):
+                    output_video_path = f"{root_path}/tmp.webm"
+                    process_video(
+                        cols, video_frames, tv_model, pupil_selection, output_video_path, cam_method=CAM_METHODS[-1]
+                    )
+                    os.remove(video_path)
 if __name__ == "__main__":

app_old.py ADDED Viewed

	@@ -0,0 +1,434 @@

+# taken from: https://huggingface.co/spaces/frgfm/torch-cam/blob/main/app.py
+# streamlit run app.py
+from io import BytesIO
+import os
+import sys
+import cv2
+import matplotlib.pyplot as plt
+import numpy as np
+import streamlit as st
+import torch
+import tempfile
+from PIL import Image
+from torchvision import models
+from torchvision.transforms.functional import normalize, resize, to_pil_image, to_tensor
+from torchvision import transforms
+from torchcam.methods import CAM
+from torchcam import methods as torchcam_methods
+from torchcam.utils import overlay_mask
+import os.path as osp
+root_path = osp.abspath(osp.join(__file__, osp.pardir))
+sys.path.append(root_path)
+from preprocessing.dataset_creation import EyeDentityDatasetCreation
+from utils import get_model
+from registry_utils import import_registered_modules
+import_registered_modules()
+# from torchcam.methods._utils import locate_candidate_layer
+CAM_METHODS = [
+    "CAM",
+    # "GradCAM",
+    # "GradCAMpp",
+    # "SmoothGradCAMpp",
+    # "ScoreCAM",
+    # "SSCAM",
+    # "ISCAM",
+    # "XGradCAM",
+    # "LayerCAM",
+]
+TV_MODELS = [
+    "ResNet18",
+    "ResNet50",
+]
+SR_METHODS = ["GFPGAN", "CodeFormer", "RealESRGAN", "SRResNet", "HAT"]
+UPSCALE = [2, 4]
+UPSCALE_METHODS = ["BILINEAR", "BICUBIC"]
+LABEL_MAP = ["left_pupil", "right_pupil"]
+@torch.no_grad()
+def _load_model(model_configs, device="cpu"):
+    model_path = os.path.join(root_path, model_configs["model_path"])
+    model_configs.pop("model_path")
+    model_dict = torch.load(model_path, map_location=device)
+    model = get_model(model_configs=model_configs)
+    model.load_state_dict(model_dict)
+    model = model.to(device)
+    model = model.eval()
+    return model
+def extract_frames(video_path):
+    vidcap = cv2.VideoCapture(video_path)
+    frames = []
+    success, image = vidcap.read()
+    count = 0
+    while success:
+        # Convert the frame to RGB (cv2 uses BGR by default)
+        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        frames.append(image_rgb)
+        success, image = vidcap.read()
+        count += 1
+    vidcap.release()
+    return frames
+# Function to check if a file is an image
+def is_image(file_extension):
+    return file_extension.lower() in ["png", "jpeg", "jpg"]
+# Function to check if a file is a video
+def is_video(file_extension):
+    return file_extension.lower() in ["mp4", "avi", "mov", "mkv", "webm"]
+def resize_frame(frame, max_width, max_height):
+    image = Image.fromarray(frame)
+    original_size = image.size
+    # Resize the frame similarly to the image resizing logic
+    if original_size[0] == original_size[1] and original_size[0] >= 256:
+        max_size = (256, 256)
+    else:
+        max_size = list(original_size)
+        if original_size[0] >= 640:
+            max_size[0] = 640
+        elif original_size[0] < 64:
+            max_size[0] = 64
+        if original_size[1] >= 480:
+            max_size[1] = 480
+        elif original_size[1] < 32:
+            max_size[1] = 32
+    image.thumbnail(max_size)
+    return image
+def main():
+    # Wide mode
+    st.set_page_config(page_title="Pupil Diameter Estimator", layout="wide")
+    # Designing the interface
+    st.title("EyeDentify Playground")
+    # For newline
+    st.write("\n")
+    # Set the columns
+    cols = st.columns((1, 1))
+    # cols = st.columns((1, 1, 1))
+    cols[0].header("Input image")
+    # cols[1].header("Raw CAM")
+    cols[-1].header("Prediction")
+    # Sidebar
+    # File selection
+    st.sidebar.title("Upload Face or Eye")
+    # Disabling warning
+    st.set_option("deprecation.showfileUploaderEncoding", False)
+    # Choose your own image
+    uploaded_file = st.sidebar.file_uploader(
+        "Upload Image or Video", type=["png", "jpeg", "jpg", "mp4", "avi", "mov", "mkv", "webm"]
+    )
+    if uploaded_file is not None:
+        # Get file extension
+        file_extension = uploaded_file.name.split(".")[-1]
+        input_imgs = []
+        if is_image(file_extension):
+            input_img = Image.open(BytesIO(uploaded_file.read()), mode="r").convert("RGB")
+            # print("input_img before = ", input_img.size)
+            max_size = [input_img.size[0], input_img.size[1]]
+            cols[0].text(f"Input Image: {max_size[0]} x {max_size[1]}")
+            if input_img.size[0] == input_img.size[1] and input_img.size[0] >= 256:
+                max_size[0] = 256
+                max_size[1] = 256
+            else:
+                if input_img.size[0] >= 640:
+                    max_size[0] = 640
+                elif input_img.size[0] < 64:
+                    max_size[0] = 64
+                if input_img.size[1] >= 480:
+                    max_size[1] = 480
+                elif input_img.size[1] < 32:
+                    max_size[1] = 32
+            input_img.thumbnail((max_size[0], max_size[1]))  # Bicubic resampling
+            input_imgs.append(input_img)
+            # print("input_img after = ", input_img.size)
+            # cols[0].image(input_img)
+            fig0, axs0 = plt.subplots(1, 1, figsize=(10, 10))
+            # Display the input image
+            axs0.imshow(input_imgs[0])
+            axs0.axis("off")
+            axs0.set_title("Input Image")
+            # Display the plot
+            cols[0].pyplot(fig0)
+            cols[0].text(f"Input Image Resized: {max_size[0]} x {max_size[1]}")
+            # TODO: show the face features extracted from the image under 'input image' column
+        elif is_video(file_extension):
+            tfile = tempfile.NamedTemporaryFile(delete=False)
+            tfile.write(uploaded_file.read())
+            video_path = tfile.name
+            # Extract frames from the video
+            frames = extract_frames(video_path)
+            print(f"Extracted {len(frames)} frames from the video")
+            # Process the frames
+            for i, frame in enumerate(frames):
+                input_imgs.append(resize_frame(frame, 640, 480))
+            os.remove(video_path)
+            fig0, axs0 = plt.subplots(1, 1, figsize=(10, 10))
+            # Display the input image
+            axs0.imshow(input_imgs[0])
+            axs0.axis("off")
+            axs0.set_title("Input Image")
+            # Display the plot
+            cols[0].pyplot(fig0)
+            # cols[0].text(f"Input Image Resized: {max_size[0]} x {max_size[1]}")
+    st.sidebar.title("Setup")
+    # Upscale selection
+    upscale = "-"
+    # upscale = st.sidebar.selectbox(
+    #     "Upscale",
+    #     ["-"] + UPSCALE,
+    #     help="Upscale the uploaded image 2 or 4 times. Keep blank for no upscaling",
+    # )
+    # Upscale method selection
+    if upscale != "-":
+        upscale_method_or_model = st.sidebar.selectbox(
+            "Upscale Method / Model",
+            UPSCALE_METHODS + SR_METHODS,
+            help="Select a method or model to upscale the uploaded image",
+        )
+    else:
+        upscale_method_or_model = None
+    # Pupil selection
+    pupil_selection = st.sidebar.selectbox(
+        "Pupil Selection",
+        ["-"] + LABEL_MAP,
+        help="Select left or right pupil OR keep blank for both pupil diameter estimation",
+    )
+    # Model selection
+    tv_model = st.sidebar.selectbox(
+        "Classification model",
+        TV_MODELS,
+        help="Supported Models for Pupil Diameter Estimation",
+    )
+    cam_method = "CAM"
+    # cam_method = st.sidebar.selectbox(
+    #     "CAM method",
+    #     CAM_METHODS,
+    #     help="The way your class activation map will be computed",
+    # )
+    # target_layer = st.sidebar.text_input(
+    #     "Target layer",
+    #     default_layer,
+    #     help='If you want to target several layers, add a "+" separator (e.g. "layer3+layer4")',
+    # )
+    st.sidebar.write("\n")
+    if st.sidebar.button("Predict Diameter & Compute CAM"):
+        if uploaded_file is None:
+            st.sidebar.error("Please upload an image first")
+        else:
+            with st.spinner("Analyzing..."):
+                model = None
+                for input_img in input_imgs:
+                    if upscale == "-":
+                        sr_configs = None
+                    else:
+                        sr_configs = {
+                            "method": upscale_method_or_model,
+                            "params": {"upscale": upscale},
+                        }
+                    config_file = {
+                        "sr_configs": sr_configs,
+                        "feature_extraction_configs": {
+                            "blink_detection": False,
+                            "upscale": upscale,
+                            "extraction_library": "mediapipe",
+                        },
+                    }
+                    img = np.array(input_img)
+                    # img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+                    # if img.shape[0] > max_size or img.shape[1] > max_size:
+                    #     img = cv2.resize(img, (max_size, max_size))
+                    ds_results = EyeDentityDatasetCreation(
+                        feature_extraction_configs=config_file["feature_extraction_configs"],
+                        sr_configs=config_file["sr_configs"],
+                    )(img)
+                    # if ds_results is not None:
+                    # print("ds_results = ", ds_results.keys())
+                    # NOTE:
+                    # ds_results.keys() contains ===> 'full_imgs', 'faces', 'eyes', 'blinks', 'iris'
+                    preprocess_steps = [
+                        transforms.ToTensor(),
+                        transforms.Resize(
+                            [32, 64],
+                            interpolation=transforms.InterpolationMode.BICUBIC,
+                            antialias=True,
+                        ),
+                    ]
+                    preprocess_function = transforms.Compose(preprocess_steps)
+                    left_eye = None
+                    right_eye = None
+                    if ds_results is None:
+                        # print("type of input_img = ", type(input_img))
+                        input_img = preprocess_function(input_img)
+                        input_img = input_img.unsqueeze(0)
+                        if pupil_selection == "left_pupil":
+                            left_eye = input_img
+                        elif pupil_selection == "right_pupil":
+                            right_eye = input_img
+                        else:
+                            left_eye = input_img
+                            right_eye = input_img
+                        # print("type of left_eye = ", type(left_eye))
+                        # print("type of right_eye = ", type(right_eye))
+                    elif "eyes" in ds_results.keys():
+                        if "left_eye" in ds_results["eyes"].keys() and ds_results["eyes"]["left_eye"] is not None:
+                            left_eye = ds_results["eyes"]["left_eye"]
+                            # print("type of left_eye = ", type(left_eye))
+                            left_eye = to_pil_image(left_eye).convert("RGB")
+                            # print("type of left_eye = ", type(left_eye))
+                            left_eye = preprocess_function(left_eye)
+                            # print("type of left_eye = ", type(left_eye))
+                            left_eye = left_eye.unsqueeze(0)
+                        if "right_eye" in ds_results["eyes"].keys() and ds_results["eyes"]["right_eye"] is not None:
+                            right_eye = ds_results["eyes"]["right_eye"]
+                            # print("type of right_eye = ", type(right_eye))
+                            right_eye = to_pil_image(right_eye).convert("RGB")
+                            # print("type of right_eye = ", type(right_eye))
+                            right_eye = preprocess_function(right_eye)
+                            # print("type of right_eye = ", type(right_eye))
+                            right_eye = right_eye.unsqueeze(0)
+                    else:
+                        # print("type of input_img = ", type(input_img))
+                        input_img = preprocess_function(input_img)
+                        input_img = input_img.unsqueeze(0)
+                        if pupil_selection == "left_pupil":
+                            left_eye = input_img
+                        elif pupil_selection == "right_pupil":
+                            right_eye = input_img
+                        else:
+                            left_eye = input_img
+                            right_eye = input_img
+                        # print("type of left_eye = ", type(left_eye))
+                        # print("type of right_eye = ", type(right_eye))
+                    # print("left_eye = ", left_eye.shape)
+                    # print("right_eye = ", right_eye.shape)
+                    if pupil_selection == "-":
+                        selected_eyes = ["left_eye", "right_eye"]
+                    elif pupil_selection == "left_pupil":
+                        selected_eyes = ["left_eye"]
+                    elif pupil_selection == "right_pupil":
+                        selected_eyes = ["right_eye"]
+                    for eye_type in selected_eyes:
+                        if model is None:
+                            model_configs = {
+                                "model_path": root_path + f"/pre_trained_models/{tv_model}/{eye_type}.pt",
+                                "registered_model_name": tv_model,
+                                "num_classes": 1,
+                            }
+                            registered_model_name = model_configs["registered_model_name"]
+                            model = _load_model(model_configs)
+                        if registered_model_name == "ResNet18":
+                            target_layer = model.resnet.layer4[-1].conv2
+                        elif registered_model_name == "ResNet50":
+                            target_layer = model.resnet.layer4[-1].conv3
+                        else:
+                            raise Exception(f"No target layer available for selected model: {registered_model_name}")
+                        if left_eye is not None and eye_type == "left_eye":
+                            input_img = left_eye
+                        elif right_eye is not None and eye_type == "right_eye":
+                            input_img = right_eye
+                        else:
+                            raise Exception("Wrong Data")
+                        if cam_method is not None:
+                            cam_extractor = torchcam_methods.__dict__[cam_method](
+                                model,
+                                target_layer=target_layer,
+                                fc_layer=model.resnet.fc,
+                                input_shape=input_img.shape,
+                            )
+                        # with torch.no_grad():
+                        out = model(input_img)
+                        cols[-1].markdown(
+                            f"<h3>Predicted Pupil Diameter: {out[0].item():.2f} mm</h3>",
+                            unsafe_allow_html=True,
+                        )
+                        # cols[-1].text(f"Predicted Pupil Diameter: {out[0].item():.2f}")
+                        # Retrieve the CAM
+                        act_maps = cam_extractor(0, out)
+                        # Fuse the CAMs if there are several
+                        activation_map = act_maps[0] if len(act_maps) == 1 else cam_extractor.fuse_cams(act_maps)
+                        # Convert input image and activation map to PIL images
+                        input_image_pil = to_pil_image(input_img.squeeze(0))
+                        activation_map_pil = to_pil_image(activation_map, mode="F")
+                        # Create the overlayed CAM result
+                        result = overlay_mask(
+                            input_image_pil,
+                            activation_map_pil,
+                            alpha=0.5,
+                        )
+                        # Create a subplot with 1 row and 2 columns
+                        fig, axs = plt.subplots(1, 2, figsize=(10, 5))
+                        # Display the input image
+                        axs[0].imshow(input_image_pil)
+                        axs[0].axis("off")
+                        axs[0].set_title("Input Image")
+                        # Display the overlayed CAM result
+                        axs[1].imshow(result)
+                        axs[1].axis("off")
+                        axs[1].set_title("Overlayed CAM")
+                        # Display the plot
+                        cols[-1].pyplot(fig)
+                        cols[-1].text(f"eye image size: {input_img.shape[-1]} x {input_img.shape[-2]}")
+# Call main() only when this file is executed directly, not when it is imported.
+if __name__ == "__main__":
+    main()

app_utils.py ADDED Viewed

	@@ -0,0 +1,374 @@

+import base64
+from io import BytesIO
+import os
+import sys
+import cv2
+from matplotlib import pyplot as plt
+import numpy as np
+import streamlit as st
+import torch
+import tempfile
+from PIL import Image
+from torchvision.transforms.functional import to_pil_image
+from torchvision import transforms
+from torchcam.methods import CAM
+from torchcam import methods as torchcam_methods
+from torchcam.utils import overlay_mask
+import os.path as osp
+root_path = osp.abspath(osp.join(__file__, osp.pardir))
+sys.path.append(root_path)
+from preprocessing.dataset_creation import EyeDentityDatasetCreation
+from utils import get_model
+@torch.no_grad()
+def load_model(model_configs, device="cpu"):
+    """Loads the pre-trained model."""
+    model_path = os.path.join(root_path, model_configs["model_path"])
+    model_dict = torch.load(model_path, map_location=device)
+    model = get_model(model_configs=model_configs)
+    model.load_state_dict(model_dict)
+    model = model.to(device).eval()
+    return model
+def extract_frames(video_path):
+    """Extracts frames from a video file."""
+    vidcap = cv2.VideoCapture(video_path)
+    frames = []
+    success, image = vidcap.read()
+    while success:
+        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        frames.append(image_rgb)
+        success, image = vidcap.read()
+    vidcap.release()
+    return frames
+def resize_frame(image, max_width=640, max_height=480):
+    if not isinstance(image, Image.Image):
+        image = Image.fromarray(image)
+    original_size = image.size
+    # Resize the frame similarly to the image resizing logic
+    if original_size[0] == original_size[1] and original_size[0] >= 256:
+        max_size = (256, 256)
+    else:
+        max_size = list(original_size)
+        if original_size[0] >= max_width:
+            max_size[0] = max_width
+        elif original_size[0] < 64:
+            max_size[0] = 64
+        if original_size[1] >= max_height:
+            max_size[1] = max_height
+        elif original_size[1] < 32:
+            max_size[1] = 32
+    image.thumbnail(max_size)
+    # image = image.resize(max_size)
+    return image
+def is_image(file_extension):
+    """Checks if the file is an image."""
+    return file_extension.lower() in ["png", "jpeg", "jpg"]
+def is_video(file_extension):
+    """Checks if the file is a video."""
+    return file_extension.lower() in ["mp4", "avi", "mov", "mkv", "webm"]
+def display_results(input_image, cam_frame, pupil_diameter, cols):
+    """Displays the input image and overlayed CAM result."""
+    fig, axs = plt.subplots(1, 2, figsize=(10, 5))
+    axs[0].imshow(input_image)
+    axs[0].axis("off")
+    axs[0].set_title("Input Image")
+    axs[1].imshow(cam_frame)
+    axs[1].axis("off")
+    axs[1].set_title("Overlayed CAM")
+    cols[-1].pyplot(fig)
+    cols[-1].text(f"Pupil Diameter: {pupil_diameter:.2f} mm")
+def preprocess_image(input_img, max_size=(256, 256)):
+    """Resizes and preprocesses an image."""
+    input_img.thumbnail(max_size)
+    preprocess_steps = [
+        transforms.ToTensor(),
+        transforms.Resize([32, 64], interpolation=transforms.InterpolationMode.BICUBIC, antialias=True),
+    ]
+    return transforms.Compose(preprocess_steps)(input_img).unsqueeze(0)
+def overlay_text_on_frame(frame, text, position=(16, 20)):
+    """Write text on the image frame using OpenCV."""
+    return cv2.putText(frame, text, position, cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1, cv2.LINE_AA)
+def process_frames(input_imgs, tv_model, pupil_selection, cam_method):
+    upscale = "-"
+    upscale_method_or_model = "-"
+    if upscale == "-":
+        sr_configs = None
+    else:
+        sr_configs = {
+            "method": upscale_method_or_model,
+            "params": {"upscale": upscale},
+        }
+    config_file = {
+        "sr_configs": sr_configs,
+        "feature_extraction_configs": {
+            "blink_detection": False,
+            "upscale": upscale,
+            "extraction_library": "mediapipe",
+        },
+    }
+    left_pupil_model = None
+    right_pupil_model = None
+    face_frames = []
+    output_frames = {}
+    input_frames = {}
+    predicted_diameters = {}
+    if pupil_selection == "both":
+        selected_eyes = ["left_eye", "right_eye"]
+    elif pupil_selection == "left_pupil":
+        selected_eyes = ["left_eye"]
+    elif pupil_selection == "right_pupil":
+        selected_eyes = ["right_eye"]
+    for eye_type in selected_eyes:
+        model_configs = {
+            "model_path": root_path + f"/pre_trained_models/{tv_model}/{eye_type}.pt",
+            "registered_model_name": tv_model,
+            "num_classes": 1,
+        }
+        if eye_type == "left_eye":
+            left_pupil_model = load_model(model_configs)
+            left_pupil_cam_extractor = None
+            output_frames[eye_type] = []
+            input_frames[eye_type] = []
+            predicted_diameters[eye_type] = []
+        else:
+            right_pupil_model = load_model(model_configs)
+            right_pupil_cam_extractor = None
+            output_frames[eye_type] = []
+            input_frames[eye_type] = []
+            predicted_diameters[eye_type] = []
+    ds_creation = EyeDentityDatasetCreation(
+        feature_extraction_configs=config_file["feature_extraction_configs"],
+        sr_configs=config_file["sr_configs"],
+    )
+    preprocess_steps = [
+        transforms.ToTensor(),
+        transforms.Resize(
+            [32, 64],
+            interpolation=transforms.InterpolationMode.BICUBIC,
+            antialias=True,
+        ),
+    ]
+    preprocess_function = transforms.Compose(preprocess_steps)
+    for input_img in input_imgs:
+        img = np.array(input_img)
+        ds_results = ds_creation(img)
+        left_eye = None
+        right_eye = None
+        blinked = False
+        if ds_results is not None and "face" in ds_results:
+            face_img = to_pil_image(ds_results["face"])
+            has_face = True
+        else:
+            face_img = to_pil_image(np.zeros((256, 256, 3), dtype=np.uint8))
+            has_face = False
+        face_frames.append({"has_face": has_face, "img": face_img})
+        if ds_results is not None and "eyes" in ds_results.keys():
+            blinked = ds_results["eyes"]["blinked"]
+            if not blinked:
+                if "left_eye" in ds_results["eyes"].keys() and ds_results["eyes"]["left_eye"] is not None:
+                    left_eye = ds_results["eyes"]["left_eye"]
+                    left_eye = to_pil_image(left_eye).convert("RGB")
+                    left_eye = preprocess_function(left_eye)
+                    left_eye = left_eye.unsqueeze(0)
+                if "right_eye" in ds_results["eyes"].keys() and ds_results["eyes"]["right_eye"] is not None:
+                    right_eye = ds_results["eyes"]["right_eye"]
+                    right_eye = to_pil_image(right_eye).convert("RGB")
+                    right_eye = preprocess_function(right_eye)
+                    right_eye = right_eye.unsqueeze(0)
+        else:
+            input_img = preprocess_function(input_img)
+            input_img = input_img.unsqueeze(0)
+            if pupil_selection == "left_pupil":
+                left_eye = input_img
+            elif pupil_selection == "right_pupil":
+                right_eye = input_img
+            else:
+                left_eye = input_img
+                right_eye = input_img
+        for eye_type in selected_eyes:
+            if left_eye is not None and eye_type == "left_eye":
+                if left_pupil_cam_extractor is None:
+                    if tv_model == "ResNet18":
+                        target_layer = left_pupil_model.resnet.layer4[-1].conv2
+                    elif tv_model == "ResNet50":
+                        target_layer = left_pupil_model.resnet.layer4[-1].conv3
+                    else:
+                        raise Exception(f"No target layer available for selected model: {tv_model}")
+                    left_pupil_cam_extractor = torchcam_methods.__dict__[cam_method](
+                        left_pupil_model,
+                        target_layer=target_layer,
+                        fc_layer=left_pupil_model.resnet.fc,
+                        input_shape=left_eye.shape,
+                    )
+                output = left_pupil_model(left_eye)
+                predicted_diameter = output[0].item()
+                act_maps = left_pupil_cam_extractor(0, output)
+                activation_map = act_maps[0] if len(act_maps) == 1 else left_pupil_cam_extractor.fuse_cams(act_maps)
+                input_image_pil = to_pil_image(left_eye.squeeze(0))
+            elif right_eye is not None and eye_type == "right_eye":
+                if right_pupil_cam_extractor is None:
+                    if tv_model == "ResNet18":
+                        target_layer = right_pupil_model.resnet.layer4[-1].conv2
+                    elif tv_model == "ResNet50":
+                        target_layer = right_pupil_model.resnet.layer4[-1].conv3
+                    else:
+                        raise Exception(f"No target layer available for selected model: {tv_model}")
+                    right_pupil_cam_extractor = torchcam_methods.__dict__[cam_method](
+                        right_pupil_model,
+                        target_layer=target_layer,
+                        fc_layer=right_pupil_model.resnet.fc,
+                        input_shape=right_eye.shape,
+                    )
+                output = right_pupil_model(right_eye)
+                predicted_diameter = output[0].item()
+                act_maps = right_pupil_cam_extractor(0, output)
+                activation_map = act_maps[0] if len(act_maps) == 1 else right_pupil_cam_extractor.fuse_cams(act_maps)
+                input_image_pil = to_pil_image(right_eye.squeeze(0))
+            if blinked:
+                zeros_img = to_pil_image(np.zeros((256, 256, 3), dtype=np.uint8))
+                input_image_pil = zeros_img
+                result = zeros_img
+                predicted_diameter = 0
+            else:
+                # Create CAM overlay
+                activation_map_pil = to_pil_image(activation_map, mode="F")
+                result = overlay_mask(input_image_pil, activation_map_pil, alpha=0.5)
+            # Add frame and predicted diameter to lists
+            input_frames[eye_type].append(np.array(input_image_pil))
+            output_frames[eye_type].append(np.array(result))
+            predicted_diameters[eye_type].append(predicted_diameter)
+    return input_frames, output_frames, predicted_diameters, face_frames
+# Function to display video with autoplay and loop
+def display_video_with_autoplay(video_col, video_path):
+    video_html = f"""
+        <video width="100%" height="auto" autoplay loop muted>
+            <source src="data:video/mp4;base64,{video_path}" type="video/mp4">
+        </video>
+    """
+    video_col.markdown(video_html, unsafe_allow_html=True)
+def get_codec_and_extension(file_format):
+    """Return codec and file extension based on the format."""
+    if file_format == "mp4":
+        return "H264", ".mp4"
+    elif file_format == "avi":
+        return "MJPG", ".avi"
+    elif file_format == "webm":
+        return "VP80", ".webm"
+    else:
+        return "MJPG", ".avi"
+def process_video(cols, video_frames, tv_model, pupil_selection, output_path, cam_method):
+    resized_frames = []
+    for i, frame in enumerate(video_frames):
+        input_img = resize_frame(frame, max_width=640, max_height=480)
+        # input_img = Image.fromarray(input_img)
+        resized_frames.append(input_img)
+    input_frames, output_frames, predicted_diameters, face_frames = process_frames(
+        resized_frames, tv_model, pupil_selection, cam_method
+    )
+    file_format = output_path.split(".")[-1]
+    codec, extension = get_codec_and_extension(file_format)
+    video_cols = cols[1].columns(len(input_frames.keys()))
+    for i, eye_type in enumerate(input_frames.keys()):
+        in_frames = input_frames[eye_type]
+        height, width, _ = in_frames[0].shape
+        fourcc = cv2.VideoWriter_fourcc(*codec)
+        fps = 10.0
+        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+        for frame in in_frames:
+            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+        out.release()
+        with open(output_path, "rb") as video_file:
+            video_bytes = video_file.read()
+            video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+        display_video_with_autoplay(video_cols[i], video_base64)
+        os.remove(output_path)
+    for i, eye_type in enumerate(output_frames.keys()):
+        out_frames = output_frames[eye_type]
+        height, width, _ = out_frames[0].shape
+        fourcc = cv2.VideoWriter_fourcc(*codec)
+        fps = 10.0
+        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+        for j, frame in enumerate(out_frames):
+            out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+        out.release()
+        with open(output_path, "rb") as video_file:
+            video_bytes = video_file.read()
+            video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+        display_video_with_autoplay(video_cols[i], video_base64)
+        os.remove(output_path)
+    for i, eye_type in enumerate(output_frames.keys()):
+        out_frames = output_frames[eye_type]
+        height, width, _ = out_frames[0].shape
+        fourcc = cv2.VideoWriter_fourcc(*codec)
+        fps = 10.0
+        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+        for diameter in predicted_diameters[eye_type]:
+            frame = np.zeros((height, width, 3), dtype=np.uint8)
+            text = f"{diameter:.2f}"
+            frame = overlay_text_on_frame(frame, text)
+            out.write(frame)
+        out.release()
+        with open(output_path, "rb") as video_file:
+            video_bytes = video_file.read()
+            video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+        display_video_with_autoplay(video_cols[i], video_base64)
+        os.remove(output_path)
+    return predicted_diameters

config.yml CHANGED Viewed

@@ -2,8 +2,9 @@ seed: 42
 feature_extraction_configs:
   blink_detection: true
   extraction_library: "mediapipe"
-  show_features: ['full_imgs', 'faces', 'eyes', 'blinks', 'iris']
 model_configs:
   models_path: "pre_trained_models"

 feature_extraction_configs:
   blink_detection: true
+  upscale: 1
   extraction_library: "mediapipe"
+  show_features: ['faces', 'eyes', 'blinks']
 model_configs:
   models_path: "pre_trained_models"

feature_extraction/extractor_mediapipe.py CHANGED Viewed

@@ -18,9 +18,7 @@ class ExtractorMediaPipe:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         # ========== Face Extraction ==========
-        self.face_detector = mp.solutions.face_detection.FaceDetection(
-            model_selection=0, min_detection_confidence=0.5
-        )
         self.face_mesh = mp.solutions.face_mesh.FaceMesh(
             max_num_faces=1,
             static_image_mode=True,
@@ -169,19 +167,11 @@ class ExtractorMediaPipe:
         left_eye_landmark3 = landmarks[left_indices[12]]
         left_eye_landmark4 = landmarks[left_indices[4]]
-        right_eye_horizontal_distance = self.euclideanDistance(
-            right_eye_landmark1, right_eye_landmark2
-        )
-        right_eye_vertical_distance = self.euclideanDistance(
-            right_eye_landmark3, right_eye_landmark4
-        )
-        left_eye_vertical_distance = self.euclideanDistance(
-            left_eye_landmark3, left_eye_landmark4
-        )
-        left_eye_horizontal_distance = self.euclideanDistance(
-            left_eye_landmark1, left_eye_landmark2
-        )
         right_eye_ratio = right_eye_vertical_distance / right_eye_horizontal_distance
         left_eye_ratio = left_eye_vertical_distance / left_eye_horizontal_distance
@@ -192,10 +182,7 @@ class ExtractorMediaPipe:
     def extract_eyes_regions(self, image, landmarks, eye_indices):
         h, w, _ = image.shape
-        points = [
-            (int(landmarks[idx].x * w), int(landmarks[idx].y * h))
-            for idx in eye_indices
-        ]
         x_min = min([p[0] for p in points])
         x_max = max([p[0] for p in points])
@@ -261,21 +248,14 @@ class ExtractorMediaPipe:
         if blink_detection:
             mesh_coordinates = self.landmarksDetection(image, results, False)
-            eyes_ratio = self.blinkRatio(
-                mesh_coordinates, self.RIGHT_EYE, self.LEFT_EYE
-            )
-            if (
-                eyes_ratio > self.blink_lower_thresh
-                and eyes_ratio <= self.blink_upper_thresh
-            ):
                 # print(
                 #     "I think person blinked. eyes_ratio = ",
                 #     eyes_ratio,
                 #     "Confirming with ViT model...",
                 # )
-                blinked = self.blink_detection_model(
-                    left_eye=left_eye, right_eye=right_eye
-                )
                 # if blinked:
                 #     print("Yes, person blinked. Confirmed by model")
                 # else:
@@ -298,9 +278,7 @@ class ExtractorMediaPipe:
         iris_img_blur = cv2.GaussianBlur(iris_img_gray, (5, 5), 0)
         # Perform adaptive thresholding
-        _, iris_img_mask = cv2.threshold(
-            iris_img_blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
-        )
         # Invert the mask
         segmented_mask = cv2.bitwise_not(iris_img_mask)
@@ -335,9 +313,7 @@ class ExtractorMediaPipe:
         cropped_left_iris = image[l_y1:l_y2, l_x1:l_x2]
-        left_iris_segmented_data = self.segment_iris(
-            cv2.cvtColor(cropped_left_iris, cv2.COLOR_BGR2RGB)
-        )
         # Crop the right iris to be exactly 16*upscaled x 16*upscaled
         r_x1 = max(int(r_cx) - (8 * self.upscale), 0)
@@ -347,9 +323,7 @@ class ExtractorMediaPipe:
         cropped_right_iris = image[r_y1:r_y2, r_x1:r_x2]
-        right_iris_segmented_data = self.segment_iris(
-            cv2.cvtColor(cropped_right_iris, cv2.COLOR_BGR2RGB)
-        )
         return {
             "left_iris": {

         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         # ========== Face Extraction ==========
+        self.face_detector = mp.solutions.face_detection.FaceDetection(model_selection=0, min_detection_confidence=0.5)
         self.face_mesh = mp.solutions.face_mesh.FaceMesh(
             max_num_faces=1,
             static_image_mode=True,
         left_eye_landmark3 = landmarks[left_indices[12]]
         left_eye_landmark4 = landmarks[left_indices[4]]
+        right_eye_horizontal_distance = self.euclideanDistance(right_eye_landmark1, right_eye_landmark2)
+        right_eye_vertical_distance = self.euclideanDistance(right_eye_landmark3, right_eye_landmark4)
+        left_eye_vertical_distance = self.euclideanDistance(left_eye_landmark3, left_eye_landmark4)
+        left_eye_horizontal_distance = self.euclideanDistance(left_eye_landmark1, left_eye_landmark2)
         right_eye_ratio = right_eye_vertical_distance / right_eye_horizontal_distance
         left_eye_ratio = left_eye_vertical_distance / left_eye_horizontal_distance
     def extract_eyes_regions(self, image, landmarks, eye_indices):
         h, w, _ = image.shape
+        points = [(int(landmarks[idx].x * w), int(landmarks[idx].y * h)) for idx in eye_indices]
         x_min = min([p[0] for p in points])
         x_max = max([p[0] for p in points])
         if blink_detection:
             mesh_coordinates = self.landmarksDetection(image, results, False)
+            eyes_ratio = self.blinkRatio(mesh_coordinates, self.RIGHT_EYE, self.LEFT_EYE)
+            if eyes_ratio > self.blink_lower_thresh and eyes_ratio <= self.blink_upper_thresh:
                 # print(
                 #     "I think person blinked. eyes_ratio = ",
                 #     eyes_ratio,
                 #     "Confirming with ViT model...",
                 # )
+                blinked = self.blink_detection_model(left_eye=left_eye, right_eye=right_eye)
                 # if blinked:
                 #     print("Yes, person blinked. Confirmed by model")
                 # else:
         iris_img_blur = cv2.GaussianBlur(iris_img_gray, (5, 5), 0)
         # Perform adaptive thresholding
+        _, iris_img_mask = cv2.threshold(iris_img_blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
         # Invert the mask
         segmented_mask = cv2.bitwise_not(iris_img_mask)
         cropped_left_iris = image[l_y1:l_y2, l_x1:l_x2]
+        left_iris_segmented_data = self.segment_iris(cv2.cvtColor(cropped_left_iris, cv2.COLOR_BGR2RGB))
         # Crop the right iris to be exactly 16*upscaled x 16*upscaled
         r_x1 = max(int(r_cx) - (8 * self.upscale), 0)
         cropped_right_iris = image[r_y1:r_y2, r_x1:r_x2]
+        right_iris_segmented_data = self.segment_iris(cv2.cvtColor(cropped_right_iris, cv2.COLOR_BGR2RGB))
         return {
             "left_iris": {

feature_extraction/features_extractor.py CHANGED Viewed

@@ -14,9 +14,7 @@ warnings.filterwarnings("ignore")
 class FeaturesExtractor:
-    def __init__(
-        self, extraction_library="mediapipe", blink_detection=False, upscale=1
-    ):
         self.upscale = upscale
         self.blink_detection = blink_detection
         self.extraction_library = extraction_library

 class FeaturesExtractor:
+    def __init__(self, extraction_library="mediapipe", blink_detection=False, upscale=1):
         self.upscale = upscale
         self.blink_detection = blink_detection
         self.extraction_library = extraction_library

image.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import cv2
+import numpy as np
+# Template script: paste per-eye CAM images onto a face image at the eye
+# bounding boxes, then save and display the result.
+# NOTE: x_left, y_left, width_left, height_left (and the *_right equivalents)
+# are placeholders that this script does not define; they must be supplied
+# (e.g. from a landmark detector) before the script can run.
+# Load the original face image
+face_image = cv2.imread("path_to_face_image.jpg")
+# Suppose CAM_left and CAM_right are the CAM results for the eyes (each 32x64)
+CAM_left = cv2.imread("path_to_CAM_left.jpg")  # or generated by your model
+CAM_right = cv2.imread("path_to_CAM_right.jpg")  # or generated by your model
+# cv2.imread returns None instead of raising when a file is missing or
+# unreadable; fail here with a clear message rather than later inside
+# cv2.resize or .copy().
+for _name, _img in (("face_image", face_image), ("CAM_left", CAM_left), ("CAM_right", CAM_right)):
+    if _img is None:
+        raise FileNotFoundError(f"Could not read image for {_name}")
+# Example bounding boxes for the left and right eye
+left_eye_bbox = (x_left, y_left, width_left, height_left)
+right_eye_bbox = (x_right, y_right, width_right, height_right)
+# Resize CAM images if needed (they should be 32x64, but resize to match bbox size)
+# cv2.resize takes the target size as (width, height)
+CAM_left_resized = cv2.resize(CAM_left, (width_left, height_left))
+CAM_right_resized = cv2.resize(CAM_right, (width_right, height_right))
+# Create a copy of the face image to overlay the CAM results
+face_with_CAM = face_image.copy()
+# Overlay left eye CAM (rows are indexed by y, columns by x)
+face_with_CAM[y_left : y_left + height_left, x_left : x_left + width_left] = CAM_left_resized
+# Overlay right eye CAM
+face_with_CAM[y_right : y_right + height_right, x_right : x_right + width_right] = CAM_right_resized
+# Save or display the result
+cv2.imwrite("face_with_CAM_overlay.jpg", face_with_CAM)
+cv2.imshow("Face with CAM Overlay", face_with_CAM)
+cv2.waitKey(0)
+cv2.destroyAllWindows()

video.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import cv2
+import torch
+# Template script: overlay per-eye CAM results onto every frame of a video
+# and write the frames to a new video file.
+# NOTE: x_left, y_left, width_left, height_left (and the *_right equivalents)
+# and generate_CAM are placeholders that this script does not define; they
+# must be supplied (e.g. from a landmark detector and the model) before the
+# script can run.
+# Load the video
+video_path = "path_to_video.mp4"
+cap = cv2.VideoCapture(video_path)
+out = None
+try:
+    # VideoCapture does not raise on a bad path; without this check the loop
+    # below would just see ret == False and no usable output would be produced.
+    if not cap.isOpened():
+        raise OSError(f"Could not open video: {video_path}")
+    # Video properties
+    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    # Create a VideoWriter object for the output video
+    out = cv2.VideoWriter("output_with_CAM.mp4", cv2.VideoWriter_fourcc(*"mp4v"), fps, (frame_width, frame_height))
+    if not out.isOpened():
+        raise OSError("Could not open output_with_CAM.mp4 for writing")
+    # Process each frame
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break  # End of the video
+        # Detect landmarks for left and right eye bounding boxes (example)
+        left_eye_bbox = (x_left, y_left, width_left, height_left)
+        right_eye_bbox = (x_right, y_right, width_right, height_right)
+        # Crop the eyes (rows are indexed by y, columns by x)
+        left_eye = frame[y_left : y_left + height_left, x_left : x_left + width_left]
+        right_eye = frame[y_right : y_right + height_right, x_right : x_right + width_right]
+        # Generate CAMs for left and right eyes
+        CAM_left = generate_CAM(left_eye)  # Use your model here
+        CAM_right = generate_CAM(right_eye)  # Use your model here
+        # Resize CAMs if necessary; cv2.resize takes the size as (width, height)
+        CAM_left_resized = cv2.resize(CAM_left, (width_left, height_left))
+        CAM_right_resized = cv2.resize(CAM_right, (width_right, height_right))
+        # Overlay the CAMs onto the original frame
+        frame[y_left : y_left + height_left, x_left : x_left + width_left] = CAM_left_resized
+        frame[y_right : y_right + height_right, x_right : x_right + width_right] = CAM_right_resized
+        # Write the processed frame to the output video
+        out.write(frame)
+finally:
+    # Release resources even when a frame fails, so the capture is closed and
+    # the writer gets a chance to finalize the output file.
+    cap.release()
+    if out is not None:
+        out.release()
+# This script never opens a display window, so there is nothing to destroy;
+# GUI calls can also fail on headless OpenCV builds.