feat: new supervision ViTPose support and annotators improvement (#3)

- feat: new supervision ViTPose support and annotators improvement, docs, and Gradio UI updates for more functionality (acb84529b192c3ec3d63c9abc055a1a8337dd91c)
Co-authored-by: Onuralp SEZER <onuralpszr@users.noreply.huggingface.co>
- app.py +122 -39
- pyproject.toml +1 -1
- requirements.txt +2 -8
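
The headline change in app.py is that detect_pose_image now exposes a confidence threshold and per-annotator toggles instead of hard-coded values. Below is a minimal sketch of how the updated signature could be exercised outside the Gradio UI, assuming this Space's app module is importable and that "person.jpg" stands in for a real input image (both are assumptions, not part of the commit):

import numpy as np
import PIL.Image

from app import detect_pose_image  # this Space's app.py; importing it also loads the models

image = PIL.Image.open("person.jpg")  # hypothetical local file
annotated, people = detect_pose_image(
    image,
    threshold=0.5,                   # detection/keypoint confidence threshold (new parameter)
    enable_labels_annotator=True,    # draw keypoint-name labels (new parameter)
    enable_vertices_annotator=True,  # draw keypoint vertices (new parameter)
)

# The annotated output can come back as a PIL image or a NumPy array depending on which
# annotators ran, so normalize before saving.
PIL.Image.fromarray(np.asarray(annotated)).save("person_annotated.png")
print(people[0]["keypoints"][:3])  # first three keypoints of the first detected person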
app.py
CHANGED
@@ -13,10 +13,62 @@ import torch
 import tqdm
 from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation

-DESCRIPTION = "
+DESCRIPTION = """
+# ViTPose
+
+<div style="display: flex; gap: 10px;">
+    <a href="https://huggingface.co/docs/transformers/en/model_doc/vitpose">
+        <img src="https://img.shields.io/badge/Huggingface-FFD21E?style=flat&logo=Huggingface&logoColor=black" alt="Huggingface">
+    </a>
+    <a href="https://arxiv.org/abs/2204.12484">
+        <img src="https://img.shields.io/badge/Arvix-B31B1B?style=flat&logo=arXiv&logoColor=white" alt="Paper">
+    </a>
+    <a href="https://github.com/ViTAE-Transformer/ViTPose">
+        <img src="https://img.shields.io/badge/Github-100000?style=flat&logo=github&logoColor=white" alt="Github">
+    </a>
+</div>
+
+ViTPose is a state-of-the-art human pose estimation model based on Vision Transformers (ViT). It employs a standard, non-hierarchical ViT backbone and a simple decoder head to predict keypoint heatmaps from images. Despite its simplicity, ViTPose achieves top results on the MS COCO Keypoint Detection benchmark.
+
+ViTPose++ further improves performance with a mixture-of-experts (MoE) module and extensive pre-training. The model is scalable, flexible, and demonstrates strong transferability across pose estimation tasks.
+
+**Key features:**
+- PyTorch implementation
+- Scalable model size (100M to 1B parameters)
+- Flexible training and inference
+- State-of-the-art accuracy on challenging benchmarks
+
+"""
+
+
+COLORS = [
+    "#A351FB",
+    "#FF4040",
+    "#FFA1A0",
+    "#FF7633",
+    "#FFB633",
+    "#D1D435",
+    "#4CFB12",
+    "#94CF1A",
+    "#40DE8A",
+    "#1B9640",
+    "#00D6C1",
+    "#2E9CAA",
+    "#00C4FF",
+    "#364797",
+    "#6675FF",
+    "#0019EF",
+    "#863AFF",
+]
+COLORS = [sv.Color.from_hex(color_hex=c) for c in COLORS]

 MAX_NUM_FRAMES = 300

+keypoint_score = 0.3
+enable_labels_annotator = True
+enable_vertices_annotator = True
+
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 person_detector_name = "PekingU/rtdetr_r50vd_coco_o365"

@@ -30,11 +82,19 @@ pose_model = VitPoseForPoseEstimation.from_pretrained(pose_model_name, device_ma

 @spaces.GPU(duration=5)
 @torch.inference_mode()
-def detect_pose_image(image: PIL.Image.Image) -> tuple[PIL.Image.Image, list[dict]]:
+def detect_pose_image(
+    image: PIL.Image.Image,
+    threshold: float = 0.3,
+    enable_labels_annotator: bool = True,
+    enable_vertices_annotator: bool = True,
+) -> tuple[PIL.Image.Image, list[dict]]:
     """Detects persons and estimates their poses in a single image.

     Args:
         image (PIL.Image.Image): Input image in which to detect persons and estimate poses.
+        threshold (Float): Confidence threshold for pose keypoints.
+        enable_labels_annotator (bool): Whether to enable annotating labels for pose keypoints.
+        enable_vertices_annotator (bool): Whether to enable annotating vertices for pose keypoints

     Returns:
         tuple[PIL.Image.Image, list[dict]]:

@@ -44,20 +104,14 @@ def detect_pose_image(image: PIL.Image.Image) -> tuple[PIL.Image.Image, list[dict]]:
     inputs = person_image_processor(images=image, return_tensors="pt").to(device)
     outputs = person_model(**inputs)
     results = person_image_processor.post_process_object_detection(
-        outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=
+        outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=threshold
     )
     result = results[0]  # take first image results

-
-
-    person_boxes_xyxy = person_boxes_xyxy.cpu().numpy()
-
-    # Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format
-    person_boxes = person_boxes_xyxy.copy()
-    person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
-    person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]
+    detections = sv.Detections.from_transformers(result)
+    person_detections_xywh = sv.xyxy_to_xywh(detections[detections.class_id == 0].xyxy)

-    inputs = pose_image_processor(image, boxes=[
+    inputs = pose_image_processor(image, boxes=[person_detections_xywh], return_tensors="pt").to(device)

     # for vitpose-plus-base checkpoint we should additionally provide dataset_index
     # to specify which MOE experts to use for inference

@@ -68,11 +122,12 @@ def detect_pose_image(image: PIL.Image.Image) -> tuple[PIL.Image.Image, list[dict]]:

     outputs = pose_model(**inputs)

-    pose_results = pose_image_processor.post_process_pose_estimation(outputs, boxes=[
+    pose_results = pose_image_processor.post_process_pose_estimation(outputs, boxes=[person_detections_xywh])
     image_pose_result = pose_results[0]  # results for first image

     # make results more human-readable
     human_readable_results = []
+    person_pose_labels = []
     for i, person_pose in enumerate(image_pose_result):
         data = {
             "person_id": i,

@@ -83,43 +138,55 @@ def detect_pose_image(image: PIL.Image.Image) -> tuple[PIL.Image.Image, list[dict]]:
             person_pose["keypoints"], person_pose["labels"], person_pose["scores"], strict=True
         ):
             keypoint_name = pose_model.config.id2label[label.item()]
+            person_pose_labels.append(keypoint_name)
             x, y = keypoint
             data["keypoints"].append({"name": keypoint_name, "x": x.item(), "y": y.item(), "score": score.item()})
         human_readable_results.append(data)

-
-
-    xy = torch.stack(xy).cpu().numpy()
-
-    scores = [pose_result["scores"] for pose_result in image_pose_result]
-    scores = torch.stack(scores).cpu().numpy()
-
-    keypoints = sv.KeyPoints(xy=xy, confidence=scores)
-    detections = sv.Detections(xyxy=person_boxes_xyxy)
+    line_thickness = sv.calculate_optimal_line_thickness(resolution_wh=(image.width, image.height))
+    text_scale = sv.calculate_optimal_text_scale(resolution_wh=(image.width, image.height))

-    edge_annotator = sv.EdgeAnnotator(color=sv.Color.
-    vertex_annotator = sv.VertexAnnotator(color=sv.Color.
-
+    edge_annotator = sv.EdgeAnnotator(color=sv.Color.WHITE, thickness=line_thickness)
+    vertex_annotator = sv.VertexAnnotator(color=sv.Color.BLUE, radius=3)
+    box_annotator = sv.BoxAnnotator(color=sv.Color.WHITE, color_lookup=sv.ColorLookup.INDEX, thickness=3)

-
+    vertex_label_annotator = sv.VertexLabelAnnotator(
+        color=COLORS, smart_position=True, border_radius=3, text_thickness=2, text_scale=text_scale
+    )

-
-
+    annotated_frame = box_annotator.annotate(scene=image.copy(), detections=detections)
+
+    for _, person_pose in enumerate(image_pose_result):
+        person_keypoints = sv.KeyPoints.from_transformers([person_pose])
+        person_labels = [pose_model.config.id2label[label.item()] for label in person_pose["labels"]]
+        # annotate edges and vertices for this person
+        annotated_frame = edge_annotator.annotate(scene=annotated_frame, key_points=person_keypoints)
+        # annotate labels for this person
+        if enable_labels_annotator:
+            annotated_frame = vertex_label_annotator.annotate(
+                scene=np.array(annotated_frame), key_points=person_keypoints, labels=person_labels
+            )
+        # annotate vertices for this person
+        if enable_vertices_annotator:
+            annotated_frame = vertex_annotator.annotate(scene=annotated_frame, key_points=person_keypoints)

-
-    annotated_frame = edge_annotator.annotate(scene=annotated_frame, key_points=keypoints)
-    return vertex_annotator.annotate(scene=annotated_frame, key_points=keypoints), human_readable_results
+    return annotated_frame, human_readable_results


-@spaces.GPU(duration=90)
 def detect_pose_video(
     video_path: str,
+    threshold: float,
+    enable_labels_annotator: bool = True,
+    enable_vertices_annotator: bool = True,
     progress: gr.Progress = gr.Progress(track_tqdm=True),  # noqa: ARG001, B008
 ) -> str:
     """Detects persons and estimates their poses for each frame in a video, saving the annotated video.

     Args:
         video_path (str): Path to the input video file.
+        threshold (Float): Confidence threshold for pose keypoints.
+        enable_labels_annotator (bool): Whether to enable annotating labels for pose keypoints.
+        enable_vertices_annotator (bool): Whether to enable annotating vertices for pose keypoints.
         progress (gr.Progress, optional): Gradio progress tracker. Defaults to gr.Progress(track_tqdm=True).

     Returns:

@@ -140,7 +207,12 @@ def detect_pose_video(
         if not ok:
             break
         rgb_frame = frame[:, :, ::-1]
-        annotated_frame, _ = detect_pose_image(
+        annotated_frame, _ = detect_pose_image(
+            PIL.Image.fromarray(rgb_frame),
+            threshold=threshold,
+            enable_labels_annotator=enable_labels_annotator,
+            enable_vertices_annotator=enable_vertices_annotator,
+        )
         writer.write(np.asarray(annotated_frame)[:, :, ::-1])
     writer.release()
     cap.release()

@@ -150,6 +222,17 @@
 with gr.Blocks(css_paths="style.css") as demo:
     gr.Markdown(DESCRIPTION)

+    keypoint_score = gr.Slider(
+        minimum=0.0,
+        maximum=1.0,
+        value=0.6,
+        step=0.01,
+        info="Adjust the confidence threshold for keypoint detection.",
+        label="Keypoint Score Threshold",
+    )
+    enable_labels_annotator = gr.Checkbox(interactive=True, value=True, label="Enable Labels")
+    enable_vertices_annotator = gr.Checkbox(interactive=True, value=True, label="Enable Vertices")
+
     with gr.Tabs():
         with gr.Tab("Image"):
             with gr.Row():

@@ -160,15 +243,15 @@ with gr.Blocks(css_paths="style.css") as demo:
                 output_image = gr.Image(label="Output Image")
                 output_json = gr.JSON(label="Output JSON")
             gr.Examples(
-                examples=sorted(pathlib.Path("images").glob("*.jpg")),
-                inputs=input_image,
+                examples=[[str(img), 0.5, True, True] for img in sorted(pathlib.Path("images").glob("*.jpg"))],
+                inputs=[input_image, keypoint_score, enable_labels_annotator, enable_vertices_annotator],
                 outputs=[output_image, output_json],
                 fn=detect_pose_image,
             )

             run_button_image.click(
                 fn=detect_pose_image,
-                inputs=input_image,
+                inputs=[input_image, keypoint_score, enable_labels_annotator, enable_vertices_annotator],
                 outputs=[output_image, output_json],
             )


@@ -183,15 +266,15 @@ with gr.Blocks(css_paths="style.css") as demo:
                 output_video = gr.Video(label="Output Video")

             gr.Examples(
-                examples=sorted(pathlib.Path("videos").glob("*.mp4")),
-                inputs=input_video,
+                examples=[[str(video), 0.5, True, True] for video in sorted(pathlib.Path("videos").glob("*.mp4"))],
+                inputs=[input_video, keypoint_score, enable_labels_annotator, enable_vertices_annotator],
                 outputs=output_video,
                 fn=detect_pose_video,
                 cache_examples=False,
             )
             run_button_video.click(
                 fn=detect_pose_video,
-                inputs=input_video,
+                inputs=[input_video, keypoint_score, enable_labels_annotator, enable_vertices_annotator],
                 outputs=output_video,
             )

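One detail worth noting from the app.py hunks above: the manual VOC (x1, y1, x2, y2) to COCO (x, y, w, h) box conversion was dropped in favor of supervision's xyxy_to_xywh helper. The short check below illustrates the equivalence; the helper call is the one the new code uses, while the box values and the comparison itself are made up for illustration:

import numpy as np
import supervision as sv

boxes_xyxy = np.array([[100.0, 50.0, 300.0, 450.0]])  # one hypothetical person box

# manual conversion, as the removed lines did it
boxes_xywh_manual = boxes_xyxy.copy()
boxes_xywh_manual[:, 2] = boxes_xywh_manual[:, 2] - boxes_xywh_manual[:, 0]  # width = x2 - x1
boxes_xywh_manual[:, 3] = boxes_xywh_manual[:, 3] - boxes_xywh_manual[:, 1]  # height = y2 - y1

# helper used by the new code (supervision >= 0.26.0, as pinned below)
boxes_xywh = sv.xyxy_to_xywh(boxes_xyxy)

assert np.allclose(boxes_xywh, boxes_xywh_manual)  # both yield (100, 50, 200, 400)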
pyproject.toml
CHANGED
@@ -10,7 +10,7 @@ dependencies = [
     "hf-transfer>=0.1.9",
     "setuptools>=80.9.0",
     "spaces>=0.37.1",
-    "supervision>=0.
+    "supervision>=0.26.0",
     "torch==2.5.1",
     "transformers>=4.53.0",
 ]
requirements.txt
CHANGED
@@ -25,15 +25,11 @@ click==8.1.8
     #   typer
     #   uvicorn
 contourpy==1.3.1
-    # via
-    #   matplotlib
-    #   supervision
+    # via matplotlib
 cycler==0.12.1
     # via matplotlib
 defusedxml==0.7.1
     # via supervision
-exceptiongroup==1.2.2
-    # via anyio
 fastapi==0.115.7
     # via gradio
 ffmpy==0.5.0

@@ -254,7 +250,7 @@ starlette==0.45.3
     #   fastapi
     #   gradio
     #   mcp
-supervision==0.
+supervision==0.26.0
     # via vitpose-transformers (pyproject.toml)
 sympy==1.13.1
     # via torch

@@ -286,12 +282,10 @@ typing-extensions==4.12.2
     #   huggingface-hub
     #   pydantic
     #   pydantic-core
-    #   rich
     #   spaces
     #   torch
     #   typer
     #   typing-inspection
-    #   uvicorn
 typing-inspection==0.4.1
     # via
     #   pydantic