Visual Servoing Platform version 3.5.1 under development (2023-05-12)
vpDetectorDNN.cpp
/****************************************************************************
 *
 * ViSP, open source Visual Servoing Platform software.
 * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
 *
 * This software is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * See the file LICENSE.txt at the root directory of this source
 * distribution for additional information about the GNU GPL.
 *
 * For using ViSP with software that can not be combined with the GNU
 * GPL, please contact Inria about acquiring a ViSP Professional
 * Edition License.
 *
 * See http://visp.inria.fr for more information.
 *
 * This software was developed at:
 * Inria Rennes - Bretagne Atlantique
 * Campus Universitaire de Beaulieu
 * 35042 Rennes Cedex
 * France
 *
 * If you have questions regarding the use of this file, please contact
 * Inria at visp@inria.fr
 *
 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
 * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Description:
 * DNN object detection using OpenCV DNN module.
 *
 *****************************************************************************/
#include <visp3/core/vpConfig.h>

#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(VISP_HAVE_OPENCV_DNN)
#include <visp3/core/vpImageConvert.h>
#include <visp3/detection/vpDetectorDNN.h>

vpDetectorDNN::vpDetectorDNN()
  : m_blob(), m_boxes(), m_classIds(), m_confidences(), m_confidenceThreshold(0.5), m_I_color(), m_img(),
    m_inputSize(300, 300), m_mean(127.5, 127.5, 127.5), m_net(), m_nmsThreshold(0.4f), m_outNames(), m_outs(),
    m_scaleFactor(2.0 / 255.0), m_swapRB(true)
{
}

vpDetectorDNN::~vpDetectorDNN() {}

bool vpDetectorDNN::detect(const vpImage<unsigned char> &I)
{
  vpImageConvert::convert(I, m_I_color);

  std::vector<vpRect> boundingBoxes;
  return detect(m_I_color, boundingBoxes);
}

bool vpDetectorDNN::detect(const vpImage<vpRGBa> &I, std::vector<vpRect> &boundingBoxes)
{
  vpImageConvert::convert(I, m_img);

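  // Use the full image resolution when no fixed input size has been set.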
  cv::Size inputSize(m_inputSize.width > 0 ? m_inputSize.width : m_img.cols,
                     m_inputSize.height > 0 ? m_inputSize.height : m_img.rows);
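  // blobFromImage() resizes the image to inputSize, subtracts m_mean,
  // multiplies by m_scaleFactor and optionally swaps the R and B channels
  // (cropping disabled).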
  cv::dnn::blobFromImage(m_img, m_blob, m_scaleFactor, inputSize, m_mean, m_swapRB, false);

  m_net.setInput(m_blob);
  m_net.forward(m_outs, m_outNames);

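  // Decode the raw outputs into m_boxes / m_classIds / m_confidences and
  // apply non-maximum suppression (fills m_boxesNMS and m_indices).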
  postProcess();

  boundingBoxes.resize(m_boxesNMS.size());
  for (size_t i = 0; i < m_boxesNMS.size(); i++) {
    cv::Rect box = m_boxesNMS[i];
    boundingBoxes[i] = vpRect(box.x, box.y, box.width, box.height);
  }

  m_nb_objects = boundingBoxes.size();
  m_polygon.resize(boundingBoxes.size());
  m_message.resize(boundingBoxes.size());
  for (size_t i = 0; i < boundingBoxes.size(); i++) {
    std::vector<vpImagePoint> polygon;

    double x = boundingBoxes[i].getLeft();
    double y = boundingBoxes[i].getTop();
    double w = boundingBoxes[i].getWidth();
    double h = boundingBoxes[i].getHeight();

    polygon.push_back(vpImagePoint(y, x));
    polygon.push_back(vpImagePoint(y + h, x));
    polygon.push_back(vpImagePoint(y + h, x + w));
    polygon.push_back(vpImagePoint(y, x + w));

    m_polygon[i] = polygon;

    std::ostringstream oss;
    int idx = m_indices[i]; // index into the pre-NMS vectors for this kept detection
    oss << m_classIds[idx] << " ; " << m_confidences[idx] << " ; " << m_boxes[idx];
    m_message[i] = oss.str();
  }

  return !boundingBoxes.empty();
}
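
/*
 * Usage sketch: a minimal detection loop with this class. The model file name
 * below is hypothetical; any network readable by cv::dnn::readNet() works.
 *
 *   vpDetectorDNN dnn;
 *   dnn.readNet("ssd-mobilenet.onnx");
 *   dnn.setInputSize(300, 300);
 *   dnn.setMean(127.5, 127.5, 127.5);
 *   dnn.setScaleFactor(2.0 / 255.0);
 *   dnn.setConfidenceThreshold(0.5f);
 *   dnn.setNMSThreshold(0.4f);
 *
 *   vpImage<vpRGBa> I; // acquired from a camera or read from disk
 *   std::vector<vpRect> bbs;
 *   if (dnn.detect(I, bbs)) {
 *     std::vector<int> ids = dnn.getDetectionClassIds();
 *     std::vector<float> scores = dnn.getDetectionConfidence();
 *     // bbs, ids and scores describe the detections kept after NMS.
 *   }
 */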

std::vector<vpRect> vpDetectorDNN::getDetectionBBs(bool afterNMS) const
{
  std::vector<vpRect> bbs;
  if (afterNMS) {
    bbs.reserve(m_boxesNMS.size());
    for (size_t i = 0; i < m_boxesNMS.size(); i++) {
      cv::Rect box = m_boxesNMS[i];
      bbs.push_back(vpRect(box.x, box.y, box.width, box.height));
    }
  } else {
    bbs.reserve(m_boxes.size());
    for (size_t i = 0; i < m_boxes.size(); i++) {
      cv::Rect box = m_boxes[i];
      bbs.push_back(vpRect(box.x, box.y, box.width, box.height));
    }
  }

  return bbs;
}

std::vector<int> vpDetectorDNN::getDetectionClassIds(bool afterNMS) const
{
  if (afterNMS) {
    std::vector<int> classIds;
    for (size_t i = 0; i < m_indices.size(); i++) {
      int idx = m_indices[i];
      classIds.push_back(m_classIds[idx]);
    }
    return classIds;
  }

  return m_classIds;
}

std::vector<float> vpDetectorDNN::getDetectionConfidence(bool afterNMS) const
{
  if (afterNMS) {
    std::vector<float> confidences;
    for (size_t i = 0; i < m_indices.size(); i++) {
      int idx = m_indices[i];
      confidences.push_back(m_confidences[idx]);
    }
    return confidences;
  }

  return m_confidences;
}

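// getUnconnectedOutLayersNames() is not available in OpenCV 3.4.3, so for
// that version the output layer names are resolved manually.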
#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
std::vector<cv::String> vpDetectorDNN::getOutputsNames()
{
  static std::vector<cv::String> names;
  if (names.empty()) {
    std::vector<int> outLayers = m_net.getUnconnectedOutLayers();
    std::vector<cv::String> layersNames = m_net.getLayerNames();
    names.resize(outLayers.size());
    for (size_t i = 0; i < outLayers.size(); ++i)
      names[i] = layersNames[outLayers[i] - 1];
  }
  return names;
}
#endif

void vpDetectorDNN::postProcess()
{
  // Direct copy from object_detection.cpp OpenCV sample
  static std::vector<int> outLayers = m_net.getUnconnectedOutLayers();
  static std::string outLayerType = m_net.getLayer(outLayers[0])->type;

  m_classIds.clear();
  m_confidences.clear();
  m_boxes.clear();
  if (m_net.getLayer(0)->outputNameToIndex("im_info") != -1) // Faster-RCNN or R-FCN
  {
    // Network produces an output blob with shape 1x1xNx7, where N is the
    // number of detections and every detection is a vector of values
    // [batchId, classId, confidence, left, top, right, bottom]
    CV_Assert(m_outs.size() == 1);
    float *data = (float *)m_outs[0].data;
    for (size_t i = 0; i < m_outs[0].total(); i += 7) {
      float confidence = data[i + 2];
      if (confidence > m_confidenceThreshold) {
        int left = (int)data[i + 3];
        int top = (int)data[i + 4];
        int right = (int)data[i + 5];
        int bottom = (int)data[i + 6];
        int width = right - left + 1;
        int height = bottom - top + 1;
        m_classIds.push_back((int)(data[i + 1]) - 1); // Skip 0th background class id.
        m_boxes.push_back(cv::Rect(left, top, width, height));
        m_confidences.push_back(confidence);
      }
    }
  } else if (outLayerType == "DetectionOutput") {
    // Network produces an output blob with shape 1x1xNx7, where N is the
    // number of detections and every detection is a vector of values
    // [batchId, classId, confidence, left, top, right, bottom]; here the
    // coordinates are normalized to [0, 1] and scaled to the image size below.
    CV_Assert(m_outs.size() == 1);
    float *data = (float *)m_outs[0].data;
    for (size_t i = 0; i < m_outs[0].total(); i += 7) {
      float confidence = data[i + 2];
      if (confidence > m_confidenceThreshold) {
        int left = (int)(data[i + 3] * m_img.cols);
        int top = (int)(data[i + 4] * m_img.rows);
        int right = (int)(data[i + 5] * m_img.cols);
        int bottom = (int)(data[i + 6] * m_img.rows);
        int width = right - left + 1;
        int height = bottom - top + 1;
        m_classIds.push_back((int)(data[i + 1]) - 1); // Skip 0th background class id.
        m_boxes.push_back(cv::Rect(left, top, width, height));
        m_confidences.push_back(confidence);
      }
    }
  } else if (outLayerType == "Region") {
    for (size_t i = 0; i < m_outs.size(); ++i) {
      // Network produces an output blob with shape NxC, where N is the number
      // of candidate boxes and C is the number of classes + 5: each row is
      // [center_x, center_y, width, height, objectness, class scores...],
      // with the box coordinates normalized to [0, 1].
      float *data = (float *)m_outs[i].data;
      for (int j = 0; j < m_outs[i].rows; ++j, data += m_outs[i].cols) {
        cv::Mat scores = m_outs[i].row(j).colRange(5, m_outs[i].cols);
        cv::Point classIdPoint;
        double confidence;
        cv::minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
        if (confidence > m_confidenceThreshold) {
          int centerX = (int)(data[0] * m_img.cols);
          int centerY = (int)(data[1] * m_img.rows);
          int width = (int)(data[2] * m_img.cols);
          int height = (int)(data[3] * m_img.rows);
          int left = centerX - width / 2;
          int top = centerY - height / 2;

          m_classIds.push_back(classIdPoint.x);
          m_confidences.push_back((float)confidence);
          m_boxes.push_back(cv::Rect(left, top, width, height));
        }
      }
    }
  } else if (outLayerType == "Identity" || outLayerType == "Softmax") {
    // In OpenCV 4.5.2, the output of ssd-mobilenet.onnx is parsed as `Softmax`, whereas
    // in OpenCV 4.5.5, the output is of type `Identity` and the output order is permuted.

    // Network produces 2 output blobs:
    // - `scores` with dimensions 1xNxC
    // - `boxes` with dimensions 1xNx4
    // where `N` is the number of detections and `C` is the number of classes (with `BACKGROUND` as classId = 0).

    int scores_index = m_outNames[0] == "scores" ? 0 : 1; // scores output index.
    int boxes_index = m_outNames[0] == "boxes" ? 0 : 1;   // boxes output index.

    int N = m_outs[scores_index].size[1], C = m_outs[scores_index].size[2];

    float *confidence = (float *)m_outs[scores_index].data;
    float *bbox = (float *)m_outs[boxes_index].data;

    // Loop over all guesses on the output of the network.
    for (int i = 0; i < N; i++) {
      uint32_t maxClass = 0;
      float maxScore = -1000.0f;

      for (int j = 1; j < C; j++) // ignore background (classId = 0).
      {
        const float score = confidence[i * C + j];

        if (score < m_confidenceThreshold)
          continue;

        if (score > maxScore) {
          maxScore = score;
          maxClass = j;
        }
      }

      if (maxScore > m_confidenceThreshold) {
        int left = (int)(bbox[4 * i] * m_img.cols);
        int top = (int)(bbox[4 * i + 1] * m_img.rows);
        int right = (int)(bbox[4 * i + 2] * m_img.cols);
        int bottom = (int)(bbox[4 * i + 3] * m_img.rows);
        int width = right - left + 1;
        int height = bottom - top + 1;

        m_boxes.push_back(cv::Rect(left, top, width, height));
        m_classIds.push_back(maxClass);
        m_confidences.push_back(maxScore);
      }
    }
  } else
    CV_Error(cv::Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);

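  // Non-maximum suppression: keep the indices of the highest-confidence boxes
  // and discard boxes that overlap a kept box by more than m_nmsThreshold.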
  cv::dnn::NMSBoxes(m_boxes, m_confidences, m_confidenceThreshold, m_nmsThreshold, m_indices);
  m_boxesNMS.resize(m_indices.size());
  for (size_t i = 0; i < m_indices.size(); ++i) {
    int idx = m_indices[i];
    m_boxesNMS[i] = m_boxes[idx];
  }
}

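// When `config` and `framework` are left empty, cv::dnn::readNet() infers the
// model format from the file extension (e.g. .caffemodel, .pb, .onnx, .weights).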
void vpDetectorDNN::readNet(const std::string &model, const std::string &config, const std::string &framework)
{
  m_net = cv::dnn::readNet(model, config, framework);
#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
  m_outNames = getOutputsNames();
#else
  m_outNames = m_net.getUnconnectedOutLayersNames();
#endif
}

void vpDetectorDNN::setConfidenceThreshold(float confThreshold) { m_confidenceThreshold = confThreshold; }

void vpDetectorDNN::setInputSize(int width, int height)
{
  m_inputSize.width = width;
  m_inputSize.height = height;
}

void vpDetectorDNN::setMean(double meanR, double meanG, double meanB) { m_mean = cv::Scalar(meanR, meanG, meanB); }

void vpDetectorDNN::setNMSThreshold(float nmsThreshold) { m_nmsThreshold = nmsThreshold; }

void vpDetectorDNN::setPreferableBackend(int backendId) { m_net.setPreferableBackend(backendId); }

void vpDetectorDNN::setPreferableTarget(int targetId) { m_net.setPreferableTarget(targetId); }

void vpDetectorDNN::setScaleFactor(double scaleFactor) { m_scaleFactor = scaleFactor; }
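// Note: with the default mean (127.5, 127.5, 127.5) and scale factor 2/255,
// blobFromImage() maps a pixel value p in [0, 255] to (p - 127.5) * 2/255,
// i.e. into [-1, 1].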

void vpDetectorDNN::setSwapRB(bool swapRB) { m_swapRB = swapRB; }

#elif !defined(VISP_BUILD_SHARED_LIBS)
// Work around to avoid warning: libvisp_core.a(vpDetectorDNN.cpp.o) has no
// symbols
void dummy_vpDetectorDNN(){};
#endif