Wanli committed
Commit · 50fc340
1 Parent(s): 5de3983

Update handpose estimation model from MediaPipe (2023feb) (#133)

* update handpose model
* update quantize model
* fix quantize path
* update readme of quantization and benchmark result
* fix document

Files changed:
- README.md +2 -2
- benchmark/config/handpose_estimation_mediapipe.yaml +1 -1
- models/handpose_estimation_mediapipe/README.md +9 -4
- models/handpose_estimation_mediapipe/demo.py +101 -39
- models/handpose_estimation_mediapipe/mp_handpose.py +17 -7
- models/palm_detection_mediapipe/README.md +3 -0
- tools/quantize/README.md +1 -1
- tools/quantize/quantize-ort.py +7 -2
- tools/quantize/transform.py +70 -1
README.md CHANGED

@@ -35,7 +35,7 @@ Guidelines:
 | [DaSiamRPN](./models/object_tracking_dasiamrpn) | Object Tracking | 1280x720 | 36.15 | 705.48 | 76.82 | --- | --- |
 | [YoutuReID](./models/person_reid_youtureid) | Person Re-Identification | 128x256 | 35.81 | 521.98 | 90.07 | 44.61 | --- |
 | [MP-PalmDet](./models/palm_detection_mediapipe) | Palm Detection | 192x192 | 11.09 | 63.79 | 83.20 | 33.81 | --- |
-| [MP-HandPose](./models/handpose_estimation_mediapipe) | Hand Pose Estimation | …
+| [MP-HandPose](./models/handpose_estimation_mediapipe) | Hand Pose Estimation | 224x224 | 4.28 | 36.19 | 40.10 | 19.47 | --- |
 
 \*: Models are quantized in per-channel mode, which run slower than per-tensor quantized models on NPU.

@@ -91,7 +91,7 @@ Some examples are listed below. You can find more in the directory of each model
 
 ### Hand Pose Estimation with [MP-HandPose](models/handpose_estimation_mediapipe/)
 
-![…](…)
+![…](…)
 
 ### QR Code Detection and Parsing with [WeChatQRCode](./models/qrcode_wechatqrcode/)
 
benchmark/config/handpose_estimation_mediapipe.yaml CHANGED

@@ -5,7 +5,7 @@ Benchmark:
   path: "data/palm_detection_20230125"
   files: ["palm1.jpg", "palm2.jpg", "palm3.jpg"]
   sizes: # [[w1, h1], ...], Omit to run at original scale
-    - […]
+    - [224, 224]
 metric:
   warmup: 30
   repeat: 10
models/handpose_estimation_mediapipe/README.md CHANGED

@@ -4,11 +4,14 @@ This model estimates 21 hand keypoints per detected hand from [palm detector](…
 
 ![…](…)
 
-This model is converted from …
-…
-  - tf_saved_model to ONNX: https://github.com/onnx/tensorflow-onnx
+This model is converted from TFLite to ONNX using the following tools:
+- TFLite model to ONNX: https://github.com/onnx/tensorflow-onnx
 - simplified by [onnx-simplifier](https://github.com/daquexian/onnx-simplifier)
 
+**Note**:
+- The int8-quantized model may produce invalid results due to a significant drop in accuracy.
+- Visit https://google.github.io/mediapipe/solutions/models.html#hands for models of larger scale.
+
 ## Demo
 
 Run the following commands to try the demo:

@@ -21,7 +24,7 @@ python demo.py -i /path/to/image
 
 ### Example outputs
 
-![…](…)
+![…](…)
 
 ## License
 

@@ … @@
 ## Reference
 
 - MediaPipe Handpose: https://github.com/tensorflow/tfjs-models/tree/master/handpose
+- MediaPipe hands model and model card: https://google.github.io/mediapipe/solutions/models.html#hands
+- Int8 model quantized with rgb evaluation set of FreiHAND: https://lmb.informatik.uni-freiburg.de/resources/datasets/FreihandDataset.en.html
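For anyone reproducing the conversion named in the README above, here is a minimal sketch of the two steps. The `.tflite` file name, the intermediate file name, and the opset are assumptions for illustration; only the tools themselves (tf2onnx and onnx-simplifier) come from the README.

```python
# Sketch: TFLite -> ONNX with tf2onnx, then simplify with onnx-simplifier.
# Requires: pip install tf2onnx onnx onnxsim
import subprocess

import onnx
from onnxsim import simplify

# Step 1: convert the TFLite model to ONNX (file names and opset are assumptions).
subprocess.run([
    "python", "-m", "tf2onnx.convert",
    "--tflite", "hand_landmark.tflite",           # assumed input file name
    "--output", "handpose_estimation_raw.onnx",   # assumed intermediate name
    "--opset", "13",                              # assumed opset
], check=True)

# Step 2: simplify the exported graph and save it under the name used in this commit.
model = onnx.load("handpose_estimation_raw.onnx")
simplified, ok = simplify(model)
assert ok, "onnx-simplifier could not validate the simplified model"
onnx.save(simplified, "handpose_estimation_mediapipe_2023feb.onnx")
```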
models/handpose_estimation_mediapipe/demo.py CHANGED

@@ -31,69 +31,126 @@ except:
 
 parser = argparse.ArgumentParser(description='Hand Pose Estimation from MediaPipe')
 parser.add_argument('--input', '-i', type=str, help='Path to the input image. Omit for using default camera.')
-parser.add_argument('--model', '-m', type=str, default='./…
+parser.add_argument('--model', '-m', type=str, default='./handpose_estimation_mediapipe_2023feb.onnx', help='Path to the model.')
 parser.add_argument('--backend', '-b', type=int, default=backends[0], help=help_msg_backends.format(*backends))
 parser.add_argument('--target', '-t', type=int, default=targets[0], help=help_msg_targets.format(*targets))
-parser.add_argument('--conf_threshold', type=float, default=0.…
+parser.add_argument('--conf_threshold', type=float, default=0.9, help='Filter out hands of confidence < conf_threshold.')
 parser.add_argument('--save', '-s', type=str, default=False, help='Set true to save results. This flag is invalid when using camera.')
 parser.add_argument('--vis', '-v', type=str2bool, default=True, help='Set true to open a window for result visualization. This flag is invalid when using camera.')
 args = parser.parse_args()
 
 
 def visualize(image, hands, print_result=False):
-    …
+    display_screen = image.copy()
+    display_3d = np.zeros((400, 400, 3), np.uint8)
+    cv.line(display_3d, (200, 0), (200, 400), (255, 255, 255), 2)
+    cv.line(display_3d, (0, 200), (400, 200), (255, 255, 255), 2)
+    cv.putText(display_3d, 'Main View', (0, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
+    cv.putText(display_3d, 'Top View', (200, 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
+    cv.putText(display_3d, 'Left View', (0, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
+    cv.putText(display_3d, 'Right View', (200, 212), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
+    is_draw = False # ensure only one hand is drawn
+
+    def draw_lines(image, landmarks, is_draw_point=True, thickness=2):
+        cv.line(image, landmarks[0], landmarks[1], (255, 255, 255), thickness)
+        cv.line(image, landmarks[1], landmarks[2], (255, 255, 255), thickness)
+        cv.line(image, landmarks[2], landmarks[3], (255, 255, 255), thickness)
+        cv.line(image, landmarks[3], landmarks[4], (255, 255, 255), thickness)
+
+        cv.line(image, landmarks[0], landmarks[5], (255, 255, 255), thickness)
+        cv.line(image, landmarks[5], landmarks[6], (255, 255, 255), thickness)
+        cv.line(image, landmarks[6], landmarks[7], (255, 255, 255), thickness)
+        cv.line(image, landmarks[7], landmarks[8], (255, 255, 255), thickness)
+
+        cv.line(image, landmarks[0], landmarks[9], (255, 255, 255), thickness)
+        cv.line(image, landmarks[9], landmarks[10], (255, 255, 255), thickness)
+        cv.line(image, landmarks[10], landmarks[11], (255, 255, 255), thickness)
+        cv.line(image, landmarks[11], landmarks[12], (255, 255, 255), thickness)
+
+        cv.line(image, landmarks[0], landmarks[13], (255, 255, 255), thickness)
+        cv.line(image, landmarks[13], landmarks[14], (255, 255, 255), thickness)
+        cv.line(image, landmarks[14], landmarks[15], (255, 255, 255), thickness)
+        cv.line(image, landmarks[15], landmarks[16], (255, 255, 255), thickness)
+
+        cv.line(image, landmarks[0], landmarks[17], (255, 255, 255), thickness)
+        cv.line(image, landmarks[17], landmarks[18], (255, 255, 255), thickness)
+        cv.line(image, landmarks[18], landmarks[19], (255, 255, 255), thickness)
+        cv.line(image, landmarks[19], landmarks[20], (255, 255, 255), thickness)
+
+        if is_draw_point:
+            for p in landmarks:
+                cv.circle(image, p, thickness, (0, 0, 255), -1)
 
     for idx, handpose in enumerate(hands):
         conf = handpose[-1]
         bbox = handpose[0:4].astype(np.int32)
-        …
+        handedness = handpose[-2]
+        if handedness <= 0.5:
+            handedness_text = 'Left'
+        else:
+            handedness_text = 'Right'
+        landmarks_screen = handpose[4:67].reshape(21, 3).astype(np.int32)
+        landmarks_word = handpose[67:130].reshape(21, 3)
 
         # Print results
         if print_result:
             print('-----------hand {}-----------'.format(idx + 1))
             print('conf: {:.2f}'.format(conf))
+            print('handedness: {}'.format(handedness_text))
             print('hand box: {}'.format(bbox))
             print('hand landmarks: ')
-            for l in …
+            for l in landmarks_screen:
+                print('\t{}'.format(l))
+            print('hand world landmarks: ')
+            for l in landmarks_word:
                 print('\t{}'.format(l))
 
+        # draw box
+        cv.rectangle(display_screen, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)
+        # draw handedness
+        cv.putText(display_screen, '{}'.format(handedness_text), (bbox[0], bbox[1] + 12), cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))
         # Draw line between each key points
-        …
+        landmarks_xy = landmarks_screen[:, 0:2]
+        draw_lines(display_screen, landmarks_xy, is_draw_point=False)
+
+        # z value is relative to WRIST
+        for p in landmarks_screen:
+            r = max(5 - p[2] // 5, 0)
+            r = min(r, 14)
+            cv.circle(display_screen, np.array([p[0], p[1]]), r, (0, 0, 255), -1)
+
+        if is_draw is False:
+            is_draw = True
+            # Main view
+            landmarks_xy = landmarks_word[:, [0, 1]]
+            landmarks_xy = (landmarks_xy * 1000 + 100).astype(np.int32)
+            draw_lines(display_3d, landmarks_xy, thickness=5)
+
+            # Top view
+            landmarks_xz = landmarks_word[:, [0, 2]]
+            landmarks_xz[:, 1] = -landmarks_xz[:, 1]
+            landmarks_xz = (landmarks_xz * 1000 + np.array([300, 100])).astype(np.int32)
+            draw_lines(display_3d, landmarks_xz, thickness=5)
+
+            # Left view
+            landmarks_yz = landmarks_word[:, [2, 1]]
+            landmarks_yz[:, 0] = -landmarks_yz[:, 0]
+            landmarks_yz = (landmarks_yz * 1000 + np.array([100, 300])).astype(np.int32)
+            draw_lines(display_3d, landmarks_yz, thickness=5)
+
+            # Right view
+            landmarks_zy = landmarks_word[:, [2, 1]]
+            landmarks_zy = (landmarks_zy * 1000 + np.array([300, 300])).astype(np.int32)
+            draw_lines(display_3d, landmarks_zy, thickness=5)
+
+    return display_screen, display_3d
 
 
 if __name__ == '__main__':
     # palm detector
     palm_detector = MPPalmDet(modelPath='../palm_detection_mediapipe/palm_detection_mediapipe_2023feb.onnx',
                               nmsThreshold=0.3,
-                              scoreThreshold=0.…
+                              scoreThreshold=0.6,
                               backendId=args.backend,
                               targetId=args.target)
     # handpose detector

@@ -108,7 +165,7 @@ if __name__ == '__main__':
 
         # Palm detector inference
         palms = palm_detector.infer(image)
-        hands = np.empty(shape=(0, …
+        hands = np.empty(shape=(0, 132))
 
         # Estimate the pose of each hand
         for palm in palms:

@@ -117,10 +174,12 @@ if __name__ == '__main__':
             if handpose is not None:
                 hands = np.vstack((hands, handpose))
         # Draw results on the input image
-        image = visualize(image, hands, True)
+        image, view_3d = visualize(image, hands, True)
 
         if len(palms) == 0:
             print('No palm detected!')
+        else:
+            print('Palm detected!')
 
         # Save results
         if args.save:

@@ -131,6 +190,7 @@ if __name__ == '__main__':
         if args.vis:
             cv.namedWindow(args.input, cv.WINDOW_AUTOSIZE)
             cv.imshow(args.input, image)
+            cv.imshow('3D HandPose Demo', view_3d)
             cv.waitKey(0)
     else: # Omit input to call default camera
         deviceId = 0

@@ -145,7 +205,7 @@ if __name__ == '__main__':
 
             # Palm detector inference
             palms = palm_detector.infer(frame)
-            hands = np.empty(shape=(0, …
+            hands = np.empty(shape=(0, 132))
 
             tm.start()
             # Estimate the pose of each hand

@@ -156,12 +216,14 @@ if __name__ == '__main__':
                     hands = np.vstack((hands, handpose))
             tm.stop()
             # Draw results on the input image
-            frame = visualize(frame, hands)
+            frame, view_3d = visualize(frame, hands)
 
             if len(palms) == 0:
                 print('No palm detected!')
             else:
+                print('Palm detected!')
                 cv.putText(frame, 'FPS: {:.2f}'.format(tm.getFPS()), (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
 
             cv.imshow('MediaPipe Handpose Detection Demo', frame)
+            cv.imshow('3D HandPose Demo', view_3d)
             tm.reset()
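A note on the four-quadrant view built in `visualize` above: the world landmarks are metric, so the demo scales them by 1000 and shifts each projection into its own 200x200 quadrant of the 400x400 canvas (offsets 100 and 300). A self-contained sketch of that mapping with synthetic values; the landmark numbers below are illustrative only, not model output.

```python
import numpy as np

# Synthetic 21x3 "world" landmarks in metres, centred near zero (illustrative only).
rng = np.random.default_rng(0)
landmarks_word = rng.uniform(-0.05, 0.05, size=(21, 3))

# Main view (x-y plane) -> top-left quadrant, mirroring the demo's `* 1000 + 100`.
main_view = (landmarks_word[:, [0, 1]] * 1000 + 100).astype(np.int32)

# Top view (x-z plane) -> top-right quadrant; z is negated before shifting, as in the demo.
top_view = landmarks_word[:, [0, 2]].copy()
top_view[:, 1] = -top_view[:, 1]
top_view = (top_view * 1000 + np.array([300, 100])).astype(np.int32)

# With inputs in +/-0.05 m, both projections land inside their 200x200 quadrants.
print(main_view.min(axis=0), main_view.max(axis=0))  # both columns stay within [50, 150]
print(top_view.min(axis=0), top_view.max(axis=0))    # x within [250, 350], y within [50, 150]
```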
models/handpose_estimation_mediapipe/mp_handpose.py CHANGED

@@ -9,7 +9,7 @@ class MPHandPose:
         self.backend_id = backendId
         self.target_id = targetId
 
-        self.input_size = np.array([…
+        self.input_size = np.array([224, 224]) # wh
         self.PALM_LANDMARK_IDS = [0, 5, 9, 13, 17, 1, 2]
         self.PALM_LANDMARKS_INDEX_OF_PALM_BASE = 0
         self.PALM_LANDMARKS_INDEX_OF_MIDDLE_FINGER_BASE = 2

@@ -115,20 +115,25 @@ class MPHandPose:
         return results # [bbox_coords, landmarks_coords, conf]
 
     def _postprocess(self, blob, rotated_palm_bbox, angle, rotation_matrix):
-        landmarks, conf = blob
+        landmarks, conf, handedness, landmarks_word = blob
 
+        conf = conf[0][0]
         if conf < self.conf_threshold:
             return None
 
-        landmarks = landmarks.reshape(-1, 3) # shape: (1, 63) -> (21, 3)
+        landmarks = landmarks[0].reshape(-1, 3) # shape: (1, 63) -> (21, 3)
+        landmarks_word = landmarks_word[0].reshape(-1, 3) # shape: (1, 63) -> (21, 3)
 
         # transform coords back to the input coords
         wh_rotated_palm_bbox = rotated_palm_bbox[1] - rotated_palm_bbox[0]
         scale_factor = wh_rotated_palm_bbox / self.input_size
         landmarks[:, :2] = (landmarks[:, :2] - self.input_size / 2) * scale_factor
+        landmarks[:, 2] = landmarks[:, 2] * max(scale_factor) # depth scaling
         coords_rotation_matrix = cv.getRotationMatrix2D((0, 0), angle, 1.0)
         rotated_landmarks = np.dot(landmarks[:, :2], coords_rotation_matrix[:, :2])
         rotated_landmarks = np.c_[rotated_landmarks, landmarks[:, 2]]
+        rotated_landmarks_world = np.dot(landmarks_word[:, :2], coords_rotation_matrix[:, :2])
+        rotated_landmarks_world = np.c_[rotated_landmarks_world, landmarks_word[:, 2]]
         # invert rotation
         rotation_component = np.array([
             [rotation_matrix[0][0], rotation_matrix[1][0]],

@@ -144,12 +149,12 @@ class MPHandPose:
         original_center = np.array([
             np.dot(center, inverse_rotation_matrix[0]),
             np.dot(center, inverse_rotation_matrix[1])])
-        landmarks = rotated_landmarks[:, :2] + original_center
+        landmarks[:, :2] = rotated_landmarks[:, :2] + original_center
 
         # get bounding box from rotated_landmarks
         bbox = np.array([
-            np.amin(landmarks, axis=0),
-            np.amax(landmarks, axis=0)]) # [top-left, bottom-right]
+            np.amin(landmarks[:, :2], axis=0),
+            np.amax(landmarks[:, :2], axis=0)]) # [top-left, bottom-right]
         # shift bounding box
         wh_bbox = bbox[1] - bbox[0]
         shift_vector = self.HAND_BOX_SHIFT_VECTOR * wh_bbox

@@ -162,4 +167,9 @@ class MPHandPose:
             center_bbox - new_half_size,
             center_bbox + new_half_size])
 
-        …
+        # [0: 4]: hand bounding box found in image of format [x1, y1, x2, y2] (top-left and bottom-right points)
+        # [4: 67]: screen landmarks with format [x1, y1, z1, x2, y2 ... x21, y21, z21], z value is relative to WRIST
+        # [67: 130]: world landmarks with format [x1, y1, z1, x2, y2 ... x21, y21, z21], 3D metric x, y, z coordinate
+        # [130]: handedness, (left)[0, 1](right) hand
+        # [131]: confidence
+        return np.r_[bbox.reshape(-1), landmarks.reshape(-1), rotated_landmarks_world.reshape(-1), handedness[0][0], conf]
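The comment block above documents the 132-element row returned by `_postprocess` and stacked into `hands` by demo.py. A small sketch of slicing such a row back into named fields; the helper below is illustrative, not part of this commit.

```python
import numpy as np

def unpack_hand(row):
    """Split one 132-element result row into named fields, following the layout
    documented at the end of MPHandPose._postprocess."""
    row = np.asarray(row, dtype=np.float32)
    assert row.shape == (132,)
    return {
        'bbox': row[0:4].reshape(2, 2),                # [[x1, y1], [x2, y2]]
        'landmarks_screen': row[4:67].reshape(21, 3),  # x, y in image pixels; z relative to WRIST
        'landmarks_world': row[67:130].reshape(21, 3), # metric 3D coordinates
        'handedness': float(row[130]),                 # <= 0.5 reads as Left, > 0.5 as Right in demo.py
        'conf': float(row[131]),
    }

# Dummy row just to show the shapes; real rows come from the estimator's inference output.
fields = unpack_hand(np.zeros(132, dtype=np.float32))
print(fields['bbox'].shape, fields['landmarks_screen'].shape, fields['landmarks_world'].shape)
```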
models/palm_detection_mediapipe/README.md CHANGED

@@ -7,6 +7,9 @@ This model detects palm bounding boxes and palm landmarks, and is converted from …
 - SSD Anchors are generated from [GenMediaPipePalmDectionSSDAnchors](https://github.com/VimalMollyn/GenMediaPipePalmDectionSSDAnchors)
 
 
+**Note**:
+- Visit https://google.github.io/mediapipe/solutions/models.html#hands for models of larger scale.
+
 ## Demo
 
 Run the following commands to try the demo:
tools/quantize/README.md CHANGED

@@ -54,4 +54,4 @@ python quantize-inc.py model1
 
 ## Dataset
 Some models are quantized with extra datasets.
-- [MP-PalmDet](../../models/palm_detection_mediapipe) …
+- [MP-PalmDet](../../models/palm_detection_mediapipe) and [MP-HandPose](../../models/handpose_estimation_mediapipe) are quantized with the evaluation set of [FreiHAND](https://lmb.informatik.uni-freiburg.de/resources/datasets/FreihandDataset.en.html). Download the dataset from [this link](https://lmb.informatik.uni-freiburg.de/data/freihand/FreiHAND_pub_v2_eval.zip). Unpack it and replace `path/to/dataset` with the path to `FreiHAND_pub_v2_eval/evaluation/rgb`.
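A quick way to check the unpacked dataset before pointing the quantization script at it; the directory below is a hypothetical unpack location, not part of the repository.

```python
# Sketch: sanity-check the FreiHAND calibration folder (hypothetical path).
import os

calib_dir = '/data/FreiHAND_pub_v2_eval/evaluation/rgb'  # replace with your unpack location
images = [f for f in os.listdir(calib_dir) if f.lower().endswith(('.jpg', '.png'))]
print('{} calibration images found in {}'.format(len(images), calib_dir))
# This is the directory that replaces 'path/to/dataset' in tools/quantize/quantize-ort.py.
```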
tools/quantize/quantize-ort.py CHANGED

@@ -14,7 +14,7 @@ from onnx import version_converter
 import onnxruntime
 from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantType, QuantFormat
 
-from transform import Compose, Resize, CenterCrop, Normalize, ColorConvert
+from transform import Compose, Resize, CenterCrop, Normalize, ColorConvert, HandAlign
 
 class DataReader(CalibrationDataReader):
     def __init__(self, model_path, image_dir, transforms, data_dim):

@@ -37,6 +37,8 @@ class DataReader(CalibrationDataReader):
                 continue
             img = cv.imread(os.path.join(image_dir, image_name))
             img = self.transforms(img)
+            if img is None:
+                continue
             blob = cv.dnn.blobFromImage(img)
             if self.data_dim == 'hwc':
                 blob = cv.transposeND(blob, [0, 2, 3, 1])

@@ -110,7 +112,10 @@ models=dict(
                        calibration_image_dir='path/to/dataset',
                        transforms=Compose([Resize(size=(192, 192)), Normalize(std=[255, 255, 255]),
                                            ColorConvert(ctype=cv.COLOR_BGR2RGB)]), data_dim='hwc'),
-    …
+    mp_handpose=Quantize(model_path='../../models/handpose_estimation_mediapipe/handpose_estimation_mediapipe_2023feb.onnx',
+                         calibration_image_dir='path/to/dataset',
+                         transforms=Compose([HandAlign("mp_handpose"), Resize(size=(224, 224)), Normalize(std=[255, 255, 255]),
+                                             ColorConvert(ctype=cv.COLOR_BGR2RGB)]), data_dim='hwc'),
 )
 
 if __name__ == '__main__':
tools/quantize/transform.py CHANGED

@@ -5,8 +5,9 @@
 # Third party copyrights are property of their respective owners.
 
 import collections
-import numpy as …
+import numpy as np
 import cv2 as cv
+import sys
 
 class Compose:
     def __init__(self, transforms=[]):

@@ -15,6 +16,8 @@ class Compose:
     def __call__(self, img):
         for t in self.transforms:
             img = t(img)
+            if img is None:
+                break
         return img
 
 class Resize:

@@ -58,3 +61,69 @@ class ColorConvert:
 
     def __call__(self, img):
         return cv.cvtColor(img, self.ctype)
+
+class HandAlign:
+    def __init__(self, model):
+        self.model = model
+        sys.path.append('../../models/palm_detection_mediapipe')
+        from mp_palmdet import MPPalmDet
+        self.palm_detector = MPPalmDet(modelPath='../../models/palm_detection_mediapipe/palm_detection_mediapipe_2023feb.onnx', nmsThreshold=0.3, scoreThreshold=0.9)
+
+    def __call__(self, img):
+        return self.mp_handpose_align(img)
+
+    def mp_handpose_align(self, img):
+        palms = self.palm_detector.infer(img)
+        if len(palms) == 0:
+            return None
+        palm = palms[0]
+        palm_bbox = palm[0:4].reshape(2, 2)
+        palm_landmarks = palm[4:18].reshape(7, 2)
+        p1 = palm_landmarks[0]
+        p2 = palm_landmarks[2]
+        radians = np.pi / 2 - np.arctan2(-(p2[1] - p1[1]), p2[0] - p1[0])
+        radians = radians - 2 * np.pi * np.floor((radians + np.pi) / (2 * np.pi))
+        angle = np.rad2deg(radians)
+        # get bbox center
+        center_palm_bbox = np.sum(palm_bbox, axis=0) / 2
+        # get rotation matrix
+        rotation_matrix = cv.getRotationMatrix2D(center_palm_bbox, angle, 1.0)
+        # get rotated image
+        rotated_image = cv.warpAffine(img, rotation_matrix, (img.shape[1], img.shape[0]))
+        # get bounding boxes from rotated palm landmarks
+        homogeneous_coord = np.c_[palm_landmarks, np.ones(palm_landmarks.shape[0])]
+        rotated_palm_landmarks = np.array([
+            np.dot(homogeneous_coord, rotation_matrix[0]),
+            np.dot(homogeneous_coord, rotation_matrix[1])])
+        # get landmark bounding box
+        rotated_palm_bbox = np.array([
+            np.amin(rotated_palm_landmarks, axis=1),
+            np.amax(rotated_palm_landmarks, axis=1)]) # [top-left, bottom-right]
+
+        # shift bounding box
+        wh_rotated_palm_bbox = rotated_palm_bbox[1] - rotated_palm_bbox[0]
+        shift_vector = [0, -0.1] * wh_rotated_palm_bbox
+        rotated_palm_bbox = rotated_palm_bbox + shift_vector
+        # squarify bounding box
+        center_rotated_plam_bbox = np.sum(rotated_palm_bbox, axis=0) / 2
+        wh_rotated_palm_bbox = rotated_palm_bbox[1] - rotated_palm_bbox[0]
+        new_half_size = np.amax(wh_rotated_palm_bbox) / 2
+        rotated_palm_bbox = np.array([
+            center_rotated_plam_bbox - new_half_size,
+            center_rotated_plam_bbox + new_half_size])
+
+        # enlarge bounding box
+        center_rotated_plam_bbox = np.sum(rotated_palm_bbox, axis=0) / 2
+        wh_rotated_palm_bbox = rotated_palm_bbox[1] - rotated_palm_bbox[0]
+        new_half_size = wh_rotated_palm_bbox * 1.5
+        rotated_palm_bbox = np.array([
+            center_rotated_plam_bbox - new_half_size,
+            center_rotated_plam_bbox + new_half_size])
+
+        # Crop the rotated image by the bounding box
+        [[x1, y1], [x2, y2]] = rotated_palm_bbox.astype(np.int32)
+        diff = np.maximum([-x1, -y1, x2 - rotated_image.shape[1], y2 - rotated_image.shape[0]], 0)
+        [x1, y1, x2, y2] = [x1, y1, x2, y2] + diff
+        crop = rotated_image[y1:y2, x1:x2, :]
+        crop = cv.copyMakeBorder(crop, diff[1], diff[3], diff[0], diff[2], cv.BORDER_CONSTANT, value=(0, 0, 0))
+        return crop
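For reference, a short usage sketch of the new transform chain, mirroring the `mp_handpose` entry in quantize-ort.py. It assumes the script is run from `tools/quantize` (so HandAlign's relative palm-detector paths resolve) and that the image path exists; both are assumptions, not part of this commit.

```python
# Sketch: run the calibration preprocessing by hand on a single image.
import cv2 as cv

from transform import ColorConvert, Compose, HandAlign, Normalize, Resize

transforms = Compose([HandAlign("mp_handpose"),
                      Resize(size=(224, 224)),
                      Normalize(std=[255, 255, 255]),
                      ColorConvert(ctype=cv.COLOR_BGR2RGB)])

img = cv.imread('/data/FreiHAND_pub_v2_eval/evaluation/rgb/00000000.jpg')  # hypothetical path
sample = transforms(img)
if sample is None:
    # HandAlign returns None when no palm is found; Compose then stops early,
    # and DataReader in quantize-ort.py skips the image.
    print('no palm detected, image skipped')
else:
    print('calibration sample shape:', sample.shape)  # expected (224, 224, 3)
```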