Ryan Lee
commited on
Commit
·
18b09e3
1
Parent(s):
4f25323
C++ Demo - Object Detection (NanoDet) (#232)
Browse files* Functional and Refactored demo.cpp for MobileNet
* Fix inference timer text, added timer reset.
* Updated README.md, remove FPS text for single image processing
* Add matching saved image message
* Removing inference time printout for video inputs
* Update FPS text to 2 decimal places
* Update coding style for braces
* Address PR comments. Adjusted to C++11 standard.
* Addressed PR comments. Added C++11 cmake configuration, extracted classID and confidence function, and use NMSBoxesBatched now.
- CMakeLists.txt +32 -0
- README.md +19 -0
- demo.cpp +508 -0
CMakeLists.txt
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
cmake_minimum_required(VERSION 3.24)
|
| 2 |
+
set(project_name "opencv_zoo_object_detection_nanodet")
|
| 3 |
+
|
| 4 |
+
PROJECT (${project_name})
|
| 5 |
+
|
| 6 |
+
set(OPENCV_VERSION "4.9.0")
|
| 7 |
+
set(OPENCV_INSTALLATION_PATH "" CACHE PATH "Where to look for OpenCV installation")
|
| 8 |
+
find_package(OpenCV ${OPENCV_VERSION} REQUIRED HINTS ${OPENCV_INSTALLATION_PATH})
|
| 9 |
+
# Find OpenCV, you may need to set OpenCV_DIR variable
|
| 10 |
+
# to the absolute path to the directory containing OpenCVConfig.cmake file
|
| 11 |
+
# via the command line or GUI
|
| 12 |
+
|
| 13 |
+
file(GLOB SourceFile
|
| 14 |
+
"demo.cpp")
|
| 15 |
+
# If the package has been found, several variables will
|
| 16 |
+
# be set, you can find the full list with descriptions
|
| 17 |
+
# in the OpenCVConfig.cmake file.
|
| 18 |
+
# Print some message showing some of them
|
| 19 |
+
message(STATUS "OpenCV library status:")
|
| 20 |
+
message(STATUS " config: ${OpenCV_DIR}")
|
| 21 |
+
message(STATUS " version: ${OpenCV_VERSION}")
|
| 22 |
+
message(STATUS " libraries: ${OpenCV_LIBS}")
|
| 23 |
+
message(STATUS " include path: ${OpenCV_INCLUDE_DIRS}")
|
| 24 |
+
|
| 25 |
+
# Declare the executable target built from your sources
|
| 26 |
+
add_executable(${project_name} ${SourceFile})
|
| 27 |
+
|
| 28 |
+
# Set C++ compilation standard to C++11
|
| 29 |
+
set(CMAKE_CXX_STANDARD 11)
|
| 30 |
+
|
| 31 |
+
# Link your application with OpenCV libraries
|
| 32 |
+
target_link_libraries(${project_name} PRIVATE ${OpenCV_LIBS})
|
README.md
CHANGED
|
@@ -7,6 +7,8 @@ Note:
|
|
| 7 |
|
| 8 |
## Demo
|
| 9 |
|
|
|
|
|
|
|
| 10 |
Run the following command to try the demo:
|
| 11 |
```shell
|
| 12 |
# detect on camera input
|
|
@@ -17,6 +19,23 @@ python demo.py --input /path/to/image -v
|
|
| 17 |
Note:
|
| 18 |
- image result saved as "result.jpg"
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
## Results
|
| 22 |
|
|
|
|
| 7 |
|
| 8 |
## Demo
|
| 9 |
|
| 10 |
+
### Python
|
| 11 |
+
|
| 12 |
Run the following command to try the demo:
|
| 13 |
```shell
|
| 14 |
# detect on camera input
|
|
|
|
| 19 |
Note:
|
| 20 |
- image result saved as "result.jpg"
|
| 21 |
|
| 22 |
+
### C++
|
| 23 |
+
|
| 24 |
+
Install latest OpenCV and CMake >= 3.24.0 to get started with:
|
| 25 |
+
|
| 26 |
+
```shell
|
| 27 |
+
# A typical and default installation path of OpenCV is /usr/local
|
| 28 |
+
cmake -B build -D OPENCV_INSTALLATION_PATH=/path/to/opencv/installation .
|
| 29 |
+
cmake --build build
|
| 30 |
+
|
| 31 |
+
# detect on camera input
|
| 32 |
+
./build/opencv_zoo_object_detection_nanodet
|
| 33 |
+
# detect on an image
|
| 34 |
+
./build/opencv_zoo_object_detection_nanodet -i=/path/to/image
|
| 35 |
+
# get help messages
|
| 36 |
+
./build/opencv_zoo_object_detection_nanodet -h
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
|
| 40 |
## Results
|
| 41 |
|
demo.cpp
ADDED
|
@@ -0,0 +1,508 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <vector>
|
| 2 |
+
#include <string>
|
| 3 |
+
#include <iostream>
|
| 4 |
+
|
| 5 |
+
#include <opencv2/opencv.hpp>
|
| 6 |
+
|
| 7 |
+
using namespace std;
|
| 8 |
+
using namespace cv;
|
| 9 |
+
using namespace dnn;
|
| 10 |
+
|
| 11 |
+
const auto backendTargetPairs = vector<pair<Backend, Target>>
|
| 12 |
+
{
|
| 13 |
+
{DNN_BACKEND_OPENCV, DNN_TARGET_CPU},
|
| 14 |
+
{DNN_BACKEND_CUDA, DNN_TARGET_CUDA},
|
| 15 |
+
{DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16},
|
| 16 |
+
{DNN_BACKEND_TIMVX, DNN_TARGET_NPU},
|
| 17 |
+
{DNN_BACKEND_CANN, DNN_TARGET_NPU}
|
| 18 |
+
};
|
| 19 |
+
|
| 20 |
+
const vector<string> nanodetClassLabels =
|
| 21 |
+
{
|
| 22 |
+
"person", "bicycle", "car", "motorcycle", "airplane", "bus",
|
| 23 |
+
"train", "truck", "boat", "traffic light", "fire hydrant",
|
| 24 |
+
"stop sign", "parking meter", "bench", "bird", "cat", "dog",
|
| 25 |
+
"horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
|
| 26 |
+
"backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
|
| 27 |
+
"skis", "snowboard", "sports ball", "kite", "baseball bat",
|
| 28 |
+
"baseball glove", "skateboard", "surfboard", "tennis racket",
|
| 29 |
+
"bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
|
| 30 |
+
"banana", "apple", "sandwich", "orange", "broccoli", "carrot",
|
| 31 |
+
"hot dog", "pizza", "donut", "cake", "chair", "couch",
|
| 32 |
+
"potted plant", "bed", "dining table", "toilet", "tv", "laptop",
|
| 33 |
+
"mouse", "remote", "keyboard", "cell phone", "microwave",
|
| 34 |
+
"oven", "toaster", "sink", "refrigerator", "book", "clock",
|
| 35 |
+
"vase", "scissors", "teddy bear", "hair drier", "toothbrush"
|
| 36 |
+
};
|
| 37 |
+
|
| 38 |
+
class NanoDet
|
| 39 |
+
{
|
| 40 |
+
public:
|
| 41 |
+
NanoDet(const String& modelPath, const float probThresh = 0.35, const float iouThresh = 0.6,
|
| 42 |
+
const Backend bId = DNN_BACKEND_DEFAULT, const Target tId = DNN_TARGET_CPU) :
|
| 43 |
+
modelPath(modelPath), probThreshold(probThresh),
|
| 44 |
+
iouThreshold(iouThresh), backendId(bId), targetId(tId),
|
| 45 |
+
imageShape(416, 416), regMax(7)
|
| 46 |
+
{
|
| 47 |
+
this->strides = { 8, 16, 32, 64 };
|
| 48 |
+
this->net = readNet(modelPath);
|
| 49 |
+
setBackendAndTarget(bId, tId);
|
| 50 |
+
this->project = Mat::zeros(1, this->regMax + 1, CV_32F);
|
| 51 |
+
for (size_t i = 0; i <= this->regMax; ++i)
|
| 52 |
+
{
|
| 53 |
+
this->project.at<float>(0, i) = static_cast<float>(i);
|
| 54 |
+
}
|
| 55 |
+
this->mean = Scalar(103.53, 116.28, 123.675);
|
| 56 |
+
this->std = Scalar(1.0 / 57.375, 1.0 / 57.12, 1.0 / 58.395);
|
| 57 |
+
this->generateAnchors();
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
void setBackendAndTarget(Backend bId, Target tId)
|
| 61 |
+
{
|
| 62 |
+
this->net.setPreferableBackend(bId);
|
| 63 |
+
this->net.setPreferableTarget(tId);
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
Mat preProcess(const Mat& inputImage)
|
| 67 |
+
{
|
| 68 |
+
Image2BlobParams paramNanodet;
|
| 69 |
+
paramNanodet.datalayout = DNN_LAYOUT_NCHW;
|
| 70 |
+
paramNanodet.ddepth = CV_32F;
|
| 71 |
+
paramNanodet.mean = this->mean;
|
| 72 |
+
paramNanodet.scalefactor = this->std;
|
| 73 |
+
paramNanodet.size = this->imageShape;
|
| 74 |
+
Mat blob;
|
| 75 |
+
blobFromImageWithParams(inputImage, blob, paramNanodet);
|
| 76 |
+
return blob;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
Mat infer(const Mat& sourceImage)
|
| 80 |
+
{
|
| 81 |
+
Mat blob = this->preProcess(sourceImage);
|
| 82 |
+
this->net.setInput(blob);
|
| 83 |
+
vector<Mat> modelOutput;
|
| 84 |
+
this->net.forward(modelOutput, this->net.getUnconnectedOutLayersNames());
|
| 85 |
+
Mat preds = this->postProcess(modelOutput);
|
| 86 |
+
return preds;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
Mat reshapeIfNeeded(const Mat& input)
|
| 90 |
+
{
|
| 91 |
+
if (input.dims == 3)
|
| 92 |
+
{
|
| 93 |
+
return input.reshape(0, input.size[1]);
|
| 94 |
+
}
|
| 95 |
+
return input;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
Mat softmaxActivation(const Mat& input)
|
| 99 |
+
{
|
| 100 |
+
Mat x_exp, x_sum, x_repeat_sum, result;
|
| 101 |
+
exp(input.reshape(0, input.total() / (this->regMax + 1)), x_exp);
|
| 102 |
+
reduce(x_exp, x_sum, 1, REDUCE_SUM, CV_32F);
|
| 103 |
+
repeat(x_sum, 1, this->regMax + 1, x_repeat_sum);
|
| 104 |
+
divide(x_exp, x_repeat_sum, result);
|
| 105 |
+
return result;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
Mat applyProjection(Mat& input)
|
| 109 |
+
{
|
| 110 |
+
Mat repeat_project;
|
| 111 |
+
repeat(this->project, input.rows, 1, repeat_project);
|
| 112 |
+
multiply(input, repeat_project, input);
|
| 113 |
+
reduce(input, input, 1, REDUCE_SUM, CV_32F);
|
| 114 |
+
Mat projection = input.col(0).clone();
|
| 115 |
+
return projection.reshape(0, projection.total() / 4);
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
void preNMS(Mat& anchors, Mat& bbox_pred, Mat& cls_score, const int nms_pre = 1000)
|
| 119 |
+
{
|
| 120 |
+
Mat max_scores;
|
| 121 |
+
reduce(cls_score, max_scores, 1, REDUCE_MAX);
|
| 122 |
+
|
| 123 |
+
Mat indices;
|
| 124 |
+
sortIdx(max_scores.t(), indices, SORT_DESCENDING);
|
| 125 |
+
|
| 126 |
+
Mat indices_float = indices.colRange(0, nms_pre);
|
| 127 |
+
Mat selected_anchors, selected_bbox_pred, selected_cls_score;
|
| 128 |
+
for (int j = 0; j < indices_float.cols; ++j)
|
| 129 |
+
{
|
| 130 |
+
selected_anchors.push_back(anchors.row(indices_float.at<int>(j)));
|
| 131 |
+
selected_bbox_pred.push_back(bbox_pred.row(indices_float.at<int>(j)));
|
| 132 |
+
selected_cls_score.push_back(cls_score.row(indices_float.at<int>(j)));
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
anchors = selected_anchors;
|
| 136 |
+
bbox_pred = selected_bbox_pred;
|
| 137 |
+
cls_score = selected_cls_score;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
void clipBoundingBoxes(Mat& x1, Mat& y1, Mat& x2, Mat& y2)
|
| 141 |
+
{
|
| 142 |
+
Mat zeros = Mat::zeros(x1.size(), x1.type());
|
| 143 |
+
x1 = min(max(x1, zeros), Scalar(this->imageShape.width - 1));
|
| 144 |
+
y1 = min(max(y1, zeros), Scalar(this->imageShape.height - 1));
|
| 145 |
+
x2 = min(max(x2, zeros), Scalar(this->imageShape.width - 1));
|
| 146 |
+
y2 = min(max(y2, zeros), Scalar(this->imageShape.height - 1));
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
Mat calculateBoundingBoxes(const Mat& anchors, const Mat& bbox_pred)
|
| 150 |
+
{
|
| 151 |
+
Mat x1 = anchors.col(0) - bbox_pred.col(0);
|
| 152 |
+
Mat y1 = anchors.col(1) - bbox_pred.col(1);
|
| 153 |
+
Mat x2 = anchors.col(0) + bbox_pred.col(2);
|
| 154 |
+
Mat y2 = anchors.col(1) + bbox_pred.col(3);
|
| 155 |
+
|
| 156 |
+
clipBoundingBoxes(x1, y1, x2, y2);
|
| 157 |
+
|
| 158 |
+
Mat bboxes;
|
| 159 |
+
hconcat(vector<Mat>{x1, y1, x2, y2}, bboxes);
|
| 160 |
+
|
| 161 |
+
return bboxes;
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
vector<Rect2d> bboxMatToRect2d(const Mat& bboxes)
|
| 165 |
+
{
|
| 166 |
+
Mat bboxes_wh(bboxes.clone());
|
| 167 |
+
bboxes_wh.colRange(2, 4) = bboxes_wh.colRange(2, 4) -= bboxes_wh.colRange(0, 2);
|
| 168 |
+
vector<Rect2d> boxesXYXY;
|
| 169 |
+
for (size_t i = 0; i < bboxes_wh.rows; i++)
|
| 170 |
+
{
|
| 171 |
+
boxesXYXY.emplace_back(bboxes.at<float>(i, 0),
|
| 172 |
+
bboxes.at<float>(i, 1),
|
| 173 |
+
bboxes.at<float>(i, 2),
|
| 174 |
+
bboxes.at<float>(i, 3));
|
| 175 |
+
}
|
| 176 |
+
return boxesXYXY;
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
Mat postProcess(const vector<Mat>& preds)
|
| 180 |
+
{
|
| 181 |
+
vector<Mat> cls_scores, bbox_preds;
|
| 182 |
+
for (size_t i = 0; i < preds.size(); i += 2)
|
| 183 |
+
{
|
| 184 |
+
cls_scores.push_back(preds[i]);
|
| 185 |
+
bbox_preds.push_back(preds[i + 1]);
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
vector<Mat> bboxes_mlvl;
|
| 189 |
+
vector<Mat> scores_mlvl;
|
| 190 |
+
|
| 191 |
+
for (size_t i = 0; i < strides.size(); ++i)
|
| 192 |
+
{
|
| 193 |
+
if (i >= cls_scores.size() || i >= bbox_preds.size()) continue;
|
| 194 |
+
// Extract necessary data
|
| 195 |
+
int stride = strides[i];
|
| 196 |
+
Mat cls_score = reshapeIfNeeded(cls_scores[i]);
|
| 197 |
+
Mat bbox_pred = reshapeIfNeeded(bbox_preds[i]);
|
| 198 |
+
Mat anchors = anchorsMlvl[i].t();
|
| 199 |
+
|
| 200 |
+
// Softmax activation, projection, and calculate bounding boxes
|
| 201 |
+
bbox_pred = softmaxActivation(bbox_pred);
|
| 202 |
+
bbox_pred = applyProjection(bbox_pred);
|
| 203 |
+
bbox_pred = stride * bbox_pred;
|
| 204 |
+
|
| 205 |
+
const int nms_pre = 1000;
|
| 206 |
+
if (nms_pre > 0 && cls_score.rows > nms_pre)
|
| 207 |
+
{
|
| 208 |
+
preNMS(anchors, bbox_pred, cls_score, nms_pre);
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
Mat bboxes = calculateBoundingBoxes(anchors, bbox_pred);
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
bboxes_mlvl.push_back(bboxes);
|
| 215 |
+
scores_mlvl.push_back(cls_score);
|
| 216 |
+
}
|
| 217 |
+
Mat bboxes;
|
| 218 |
+
Mat scores;
|
| 219 |
+
vconcat(bboxes_mlvl, bboxes);
|
| 220 |
+
vconcat(scores_mlvl, scores);
|
| 221 |
+
|
| 222 |
+
vector<Rect2d> boxesXYXY = bboxMatToRect2d(bboxes);
|
| 223 |
+
vector<int> classIds;
|
| 224 |
+
vector<float> confidences;
|
| 225 |
+
for (size_t i = 0; i < scores.rows; ++i)
|
| 226 |
+
{
|
| 227 |
+
Point maxLoc;
|
| 228 |
+
minMaxLoc(scores.row(i), nullptr, nullptr, nullptr, &maxLoc);
|
| 229 |
+
classIds.push_back(maxLoc.x);
|
| 230 |
+
confidences.push_back(scores.at<float>(i, maxLoc.x));
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
vector<int> indices;
|
| 234 |
+
NMSBoxesBatched(boxesXYXY, confidences, classIds, probThreshold, iouThreshold, indices);
|
| 235 |
+
Mat result(int(indices.size()), 6, CV_32FC1);
|
| 236 |
+
int row = 0;
|
| 237 |
+
for (auto idx : indices)
|
| 238 |
+
{
|
| 239 |
+
bboxes.rowRange(idx, idx + 1).copyTo(result(Rect(0, row, 4, 1)));
|
| 240 |
+
result.at<float>(row, 4) = confidences[idx];
|
| 241 |
+
result.at<float>(row, 5) = static_cast<float>(classIds[idx]);
|
| 242 |
+
row++;
|
| 243 |
+
}
|
| 244 |
+
if (indices.size() == 0)
|
| 245 |
+
{
|
| 246 |
+
return Mat();
|
| 247 |
+
}
|
| 248 |
+
return result;
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
void generateAnchors()
|
| 252 |
+
{
|
| 253 |
+
for (const int stride : strides) {
|
| 254 |
+
int feat_h = this->imageShape.height / stride;
|
| 255 |
+
int feat_w = this->imageShape.width / stride;
|
| 256 |
+
|
| 257 |
+
vector<Mat> anchors;
|
| 258 |
+
|
| 259 |
+
for (int y = 0; y < feat_h; ++y)
|
| 260 |
+
{
|
| 261 |
+
for (int x = 0; x < feat_w; ++x)
|
| 262 |
+
{
|
| 263 |
+
float shift_x = x * stride;
|
| 264 |
+
float shift_y = y * stride;
|
| 265 |
+
float cx = shift_x + 0.5 * (stride - 1);
|
| 266 |
+
float cy = shift_y + 0.5 * (stride - 1);
|
| 267 |
+
Mat anchor_point = (Mat_<float>(2, 1) << cx, cy);
|
| 268 |
+
anchors.push_back(anchor_point);
|
| 269 |
+
}
|
| 270 |
+
}
|
| 271 |
+
Mat anchors_mat;
|
| 272 |
+
hconcat(anchors, anchors_mat);
|
| 273 |
+
this->anchorsMlvl.push_back(anchors_mat);
|
| 274 |
+
}
|
| 275 |
+
}
|
| 276 |
+
private:
|
| 277 |
+
Net net;
|
| 278 |
+
String modelPath;
|
| 279 |
+
vector<int> strides;
|
| 280 |
+
Size imageShape;
|
| 281 |
+
int regMax;
|
| 282 |
+
float probThreshold;
|
| 283 |
+
float iouThreshold;
|
| 284 |
+
Backend backendId;
|
| 285 |
+
Target targetId;
|
| 286 |
+
Mat project;
|
| 287 |
+
Scalar mean;
|
| 288 |
+
Scalar std;
|
| 289 |
+
vector<Mat> anchorsMlvl;
|
| 290 |
+
};
|
| 291 |
+
|
| 292 |
+
// Function to resize and pad an image and return both the image and scale information
|
| 293 |
+
tuple<Mat, vector<double>> letterbox(const Mat& sourceImage, const Size& target_size = Size(416, 416))
|
| 294 |
+
{
|
| 295 |
+
Mat img = sourceImage.clone();
|
| 296 |
+
|
| 297 |
+
double top = 0, left = 0, newh = target_size.height, neww = target_size.width;
|
| 298 |
+
|
| 299 |
+
if (img.rows != img.cols)
|
| 300 |
+
{
|
| 301 |
+
double hw_scale = static_cast<double>(img.rows) / img.cols;
|
| 302 |
+
if (hw_scale > 1)
|
| 303 |
+
{
|
| 304 |
+
newh = target_size.height;
|
| 305 |
+
neww = static_cast<int>(target_size.width / hw_scale);
|
| 306 |
+
resize(img, img, Size(neww, newh), 0, 0, INTER_AREA);
|
| 307 |
+
left = static_cast<int>((target_size.width - neww) * 0.5);
|
| 308 |
+
copyMakeBorder(img, img, 0, 0, left, target_size.width - neww - left, BORDER_CONSTANT, Scalar(0));
|
| 309 |
+
}
|
| 310 |
+
else
|
| 311 |
+
{
|
| 312 |
+
newh = static_cast<int>(target_size.height * hw_scale);
|
| 313 |
+
neww = target_size.width;
|
| 314 |
+
resize(img, img, Size(neww, newh), 0, 0, INTER_AREA);
|
| 315 |
+
top = static_cast<int>((target_size.height - newh) * 0.5);
|
| 316 |
+
copyMakeBorder(img, img, top, target_size.height - newh - top, 0, 0, BORDER_CONSTANT, Scalar(0));
|
| 317 |
+
}
|
| 318 |
+
}
|
| 319 |
+
else
|
| 320 |
+
{
|
| 321 |
+
resize(img, img, target_size, 0, 0, INTER_AREA);
|
| 322 |
+
}
|
| 323 |
+
vector<double> letterbox_scale = {top, left, newh, neww};
|
| 324 |
+
|
| 325 |
+
return make_tuple(img, letterbox_scale);
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
// Function to scale bounding boxes back to original image coordinates
|
| 329 |
+
vector<int> unletterbox(const Mat& bbox, const Size& original_image_shape, const vector<double>& letterbox_scale)
|
| 330 |
+
{
|
| 331 |
+
vector<int> ret(bbox.cols);
|
| 332 |
+
|
| 333 |
+
int h = original_image_shape.height;
|
| 334 |
+
int w = original_image_shape.width;
|
| 335 |
+
double top = letterbox_scale[0];
|
| 336 |
+
double left = letterbox_scale[1];
|
| 337 |
+
double newh = letterbox_scale[2];
|
| 338 |
+
double neww = letterbox_scale[3];
|
| 339 |
+
|
| 340 |
+
if (h == w)
|
| 341 |
+
{
|
| 342 |
+
double ratio = static_cast<double>(h) / newh;
|
| 343 |
+
for (int& val : ret)
|
| 344 |
+
{
|
| 345 |
+
val = static_cast<int>(val * ratio);
|
| 346 |
+
}
|
| 347 |
+
return ret;
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
double ratioh = static_cast<double>(h) / newh;
|
| 351 |
+
double ratiow = static_cast<double>(w) / neww;
|
| 352 |
+
ret[0] = max(static_cast<int>((bbox.at<float>(0) - left) * ratiow), 0);
|
| 353 |
+
ret[1] = max(static_cast<int>((bbox.at<float>(1) - top) * ratioh), 0);
|
| 354 |
+
ret[2] = min(static_cast<int>((bbox.at<float>(2) - left) * ratiow), w);
|
| 355 |
+
ret[3] = min(static_cast<int>((bbox.at<float>(3) - top) * ratioh), h);
|
| 356 |
+
|
| 357 |
+
return ret;
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
// Function to visualize predictions on an image
|
| 361 |
+
Mat visualize(const Mat& preds, const Mat& result_image, const vector<double>& letterbox_scale, bool video, double fps = 0.0)
|
| 362 |
+
{
|
| 363 |
+
Mat visualized_image = result_image.clone();
|
| 364 |
+
|
| 365 |
+
// Draw FPS if provided
|
| 366 |
+
if (fps > 0.0 && video)
|
| 367 |
+
{
|
| 368 |
+
std::ostringstream fps_stream;
|
| 369 |
+
fps_stream << "FPS: " << std::fixed << std::setprecision(2) << fps;
|
| 370 |
+
putText(visualized_image, fps_stream.str(), Point(10, 25), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);
|
| 371 |
+
}
|
| 372 |
+
|
| 373 |
+
// Draw bounding boxes and labels for each prediction
|
| 374 |
+
for (size_t i = 0; i < preds.rows; i++)
|
| 375 |
+
{
|
| 376 |
+
Mat pred = preds.row(i);
|
| 377 |
+
Mat bbox = pred.colRange(0, 4);
|
| 378 |
+
double conf = pred.at<float>(4);
|
| 379 |
+
int classid = static_cast<int>(pred.at<float>(5));
|
| 380 |
+
|
| 381 |
+
// Convert bbox coordinates back to original image space
|
| 382 |
+
vector<int> unnormalized_bbox = unletterbox(bbox, visualized_image.size(), letterbox_scale);
|
| 383 |
+
|
| 384 |
+
// Draw bounding box
|
| 385 |
+
rectangle(visualized_image, Point(unnormalized_bbox[0], unnormalized_bbox[1]),
|
| 386 |
+
Point(unnormalized_bbox[2], unnormalized_bbox[3]), Scalar(0, 255, 0), 2);
|
| 387 |
+
|
| 388 |
+
// Draw label
|
| 389 |
+
stringstream label;
|
| 390 |
+
label << nanodetClassLabels[classid] << ": " << fixed << setprecision(2) << conf;
|
| 391 |
+
putText(visualized_image, label.str(), Point(unnormalized_bbox[0], unnormalized_bbox[1] - 10),
|
| 392 |
+
FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0), 2);
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
return visualized_image;
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
void processImage(Mat& inputImage, NanoDet& nanodet, TickMeter& tm, bool save, bool vis, bool video)
|
| 399 |
+
{
|
| 400 |
+
cvtColor(inputImage, inputImage, COLOR_BGR2RGB);
|
| 401 |
+
tuple<Mat, vector<double>> w = letterbox(inputImage);
|
| 402 |
+
Mat inputBlob = get<0>(w);
|
| 403 |
+
vector<double> letterboxScale = get<1>(w);
|
| 404 |
+
|
| 405 |
+
tm.start();
|
| 406 |
+
Mat predictions = nanodet.infer(inputBlob);
|
| 407 |
+
tm.stop();
|
| 408 |
+
if (!video)
|
| 409 |
+
{
|
| 410 |
+
cout << "Inference time: " << tm.getTimeMilli() << " ms\n";
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
Mat img = visualize(predictions, inputImage, letterboxScale, video, tm.getFPS());
|
| 414 |
+
cvtColor(img, img, COLOR_BGR2RGB);
|
| 415 |
+
if (save)
|
| 416 |
+
{
|
| 417 |
+
static const string kOutputName = "result.jpg";
|
| 418 |
+
imwrite(kOutputName, img);
|
| 419 |
+
if (!video)
|
| 420 |
+
{
|
| 421 |
+
cout << "Results saved to " + kOutputName << endl;
|
| 422 |
+
}
|
| 423 |
+
}
|
| 424 |
+
if (vis)
|
| 425 |
+
{
|
| 426 |
+
static const string kWinName = "model";
|
| 427 |
+
imshow(kWinName, img);
|
| 428 |
+
}
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
const String keys =
|
| 433 |
+
"{ help h | | Print help message. }"
|
| 434 |
+
"{ model m | object_detection_nanodet_2022nov.onnx | Usage: Path to the model, defaults to object_detection_nanodet_2022nov.onnx }"
|
| 435 |
+
"{ input i | | Path to the input image. Omit for using the default camera.}"
|
| 436 |
+
"{ confidence | 0.35 | Class confidence }"
|
| 437 |
+
"{ nms | 0.6 | Enter nms IOU threshold }"
|
| 438 |
+
"{ save s | true | Specify to save results. This flag is invalid when using the camera. }"
|
| 439 |
+
"{ vis v | true | Specify to open a window for result visualization. This flag is invalid when using the camera. }"
|
| 440 |
+
"{ backend bt | 0 | Choose one of computation backends: "
|
| 441 |
+
"0: (default) OpenCV implementation + CPU, "
|
| 442 |
+
"1: CUDA + GPU (CUDA), "
|
| 443 |
+
"2: CUDA + GPU (CUDA FP16), "
|
| 444 |
+
"3: TIM-VX + NPU, "
|
| 445 |
+
"4: CANN + NPU}";
|
| 446 |
+
|
| 447 |
+
int main(int argc, char** argv)
|
| 448 |
+
{
|
| 449 |
+
CommandLineParser parser(argc, argv, keys);
|
| 450 |
+
|
| 451 |
+
parser.about("Use this script to run Nanodet inference using OpenCV, a contribution by Sri Siddarth Chakaravarthy as part of GSOC_2022.");
|
| 452 |
+
if (parser.has("help"))
|
| 453 |
+
{
|
| 454 |
+
parser.printMessage();
|
| 455 |
+
return 0;
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
string model = parser.get<String>("model");
|
| 459 |
+
string inputPath = parser.get<String>("input");
|
| 460 |
+
float confThreshold = parser.get<float>("confidence");
|
| 461 |
+
float nmsThreshold = parser.get<float>("nms");
|
| 462 |
+
bool save = parser.get<bool>("save");
|
| 463 |
+
bool vis = parser.get<bool>("vis");
|
| 464 |
+
int backendTargetid = parser.get<int>("backend");
|
| 465 |
+
|
| 466 |
+
if (model.empty())
|
| 467 |
+
{
|
| 468 |
+
CV_Error(Error::StsError, "Model file " + model + " not found");
|
| 469 |
+
}
|
| 470 |
+
|
| 471 |
+
NanoDet nanodet(model, confThreshold, nmsThreshold,
|
| 472 |
+
backendTargetPairs[backendTargetid].first, backendTargetPairs[backendTargetid].second);
|
| 473 |
+
|
| 474 |
+
TickMeter tm;
|
| 475 |
+
if (parser.has("input"))
|
| 476 |
+
{
|
| 477 |
+
Mat inputImage = imread(samples::findFile(inputPath));
|
| 478 |
+
static const bool kNotVideo = false;
|
| 479 |
+
processImage(inputImage, nanodet, tm, save, vis, kNotVideo);
|
| 480 |
+
waitKey(0);
|
| 481 |
+
}
|
| 482 |
+
else
|
| 483 |
+
{
|
| 484 |
+
VideoCapture cap;
|
| 485 |
+
cap.open(0);
|
| 486 |
+
if (!cap.isOpened())
|
| 487 |
+
{
|
| 488 |
+
CV_Error(Error::StsError, "Cannot open video or file");
|
| 489 |
+
}
|
| 490 |
+
|
| 491 |
+
Mat frame;
|
| 492 |
+
while (waitKey(1) < 0)
|
| 493 |
+
{
|
| 494 |
+
cap >> frame;
|
| 495 |
+
if (frame.empty())
|
| 496 |
+
{
|
| 497 |
+
cout << "Frame is empty" << endl;
|
| 498 |
+
waitKey();
|
| 499 |
+
break;
|
| 500 |
+
}
|
| 501 |
+
tm.reset();
|
| 502 |
+
static const bool kIsVideo = true;
|
| 503 |
+
processImage(frame, nanodet, tm, save, vis, kIsVideo);
|
| 504 |
+
}
|
| 505 |
+
cap.release();
|
| 506 |
+
}
|
| 507 |
+
return 0;
|
| 508 |
+
}
|