vpDetectorDNNOpenCV.cpp
/****************************************************************************
 *
 * ViSP, open source Visual Servoing Platform software.
 * Copyright (C) 2005 - 2023 by Inria. All rights reserved.
 *
 * This software is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * See the file LICENSE.txt at the root directory of this source
 * distribution for additional information about the GNU GPL.
 *
 * For using ViSP with software that can not be combined with the GNU
 * GPL, please contact Inria about acquiring a ViSP Professional
 * Edition License.
 *
 * See https://visp.inria.fr for more information.
 *
 * This software was developed at:
 * Inria Rennes - Bretagne Atlantique
 * Campus Universitaire de Beaulieu
 * 35042 Rennes Cedex
 * France
 *
 * If you have questions regarding the use of this file, please contact
 * Inria at visp@inria.fr
 *
 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
 * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Description:
 * DNN object detection using OpenCV DNN module.
 *
 *****************************************************************************/
#include <visp3/core/vpConfig.h>

// Check if c++17 or higher
#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
  ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))

#include <visp3/core/vpImageConvert.h>
#include <visp3/detection/vpDetectorDNNOpenCV.h>
#include <visp3/core/vpIoTools.h>

#include <algorithm>
/*!
 * Return a string listing, between square brackets, the names of all the parsing
 * methods handled by the class.
 */
std::string vpDetectorDNNOpenCV::getAvailableDnnResultsParsingTypes()
{
  std::string list = "[";
  for (unsigned int i = 0; i < vpDetectorDNNOpenCV::COUNT - 1; i++) {
    list += "\"" + dnnResultsParsingTypeToString((vpDetectorDNNOpenCV::DNNResultsParsingType)i) + "\", ";
  }
  list += "\"" + dnnResultsParsingTypeToString((vpDetectorDNNOpenCV::DNNResultsParsingType)(vpDetectorDNNOpenCV::COUNT - 1)) + "\"]";
  return list;
}

/*!
 * Convert a DNNResultsParsingType into its string representation.
 */
std::string vpDetectorDNNOpenCV::dnnResultsParsingTypeToString(const DNNResultsParsingType &type)
{
  std::string name;
  switch (type) {
  case YOLO_V3:
    name = "yolov3";
    break;
  case YOLO_V4:
    name = "yolov4";
    break;
  case YOLO_V5:
    name = "yolov5";
    break;
  case YOLO_V7:
    name = "yolov7";
    break;
  case YOLO_V8:
    name = "yolov8";
    break;
  case FASTER_RCNN:
    name = "faster-rcnn";
    break;
  case SSD_MOBILENET:
    name = "ssd-mobilenet";
    break;
  case RESNET_10:
    name = "resnet-10";
    break;
  case USER_SPECIFIED:
    name = "user-specified";
    break;
  case COUNT:
    name = "unknown";
    break;
  }
  return name;
}

/*!
 * Convert a string into the corresponding DNNResultsParsingType. The comparison is
 * case-insensitive; COUNT is returned when no match is found.
 */
vpDetectorDNNOpenCV::DNNResultsParsingType vpDetectorDNNOpenCV::dnnResultsParsingTypeFromString(const std::string &name)
{
  vpDetectorDNNOpenCV::DNNResultsParsingType res(COUNT);
  bool hasFoundMatch = false;
  std::string name_lowercase = vpIoTools::toLowerCase(name);
  for (int id = 0; id < COUNT && !hasFoundMatch; id++) {
    vpDetectorDNNOpenCV::DNNResultsParsingType temp = (vpDetectorDNNOpenCV::DNNResultsParsingType)id;
    if (dnnResultsParsingTypeToString(temp) == name_lowercase) {
      res = temp;
      hasFoundMatch = true;
    }
  }
  return res;
}
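
// Illustrative round-trip (a sketch, not part of the original file): matching is
// case-insensitive because the input name is lower-cased before comparison.
//   vpDetectorDNNOpenCV::DNNResultsParsingType type =
//       vpDetectorDNNOpenCV::dnnResultsParsingTypeFromString("YOLOv8"); // returns YOLO_V8
//   std::string name = vpDetectorDNNOpenCV::dnnResultsParsingTypeToString(type); // "yolov8"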

/*!
 * Parse a file containing the list of class names recognized by the network.
 */
std::vector<std::string> vpDetectorDNNOpenCV::parseClassNamesFile(const std::string &filename)
{
  return NetConfig::parseClassNamesFile(filename);
}

/*!
 * Default constructor. The parsing method is set to postProcess_unimplemented and must
 * be configured before calling detect().
 */
vpDetectorDNNOpenCV::vpDetectorDNNOpenCV()
  : m_applySizeFilterAfterNMS(false), m_blob(), m_I_color(), m_img(),
    m_net(), m_netConfig(), m_outNames(), m_dnnRes(),
    m_parsingMethod(vpDetectorDNNOpenCV::postProcess_unimplemented)
{
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
}

/*!
 * Construct a detector from a network configuration and a parsing method, and read the
 * network if a model file is given in the configuration.
 */
vpDetectorDNNOpenCV::vpDetectorDNNOpenCV(const NetConfig &config, const DNNResultsParsingType &typeParsingMethod, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
  : m_applySizeFilterAfterNMS(false), m_blob(), m_I_color(), m_img(),
    m_net(), m_netConfig(config), m_outNames(), m_dnnRes()
{
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
  setParsingMethod(typeParsingMethod, parsingMethod);
  if (!m_netConfig.m_modelFilename.empty()) {
    readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
  }
}
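
// Usage sketch (illustrative; assumes a NetConfig `config` prepared elsewhere with
// the model path, class names and thresholds, and that the parsing-method function
// pointer has a default value in the header):
//   vpDetectorDNNOpenCV detector(config, vpDetectorDNNOpenCV::YOLO_V5);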

#ifdef VISP_HAVE_NLOHMANN_JSON

using json = nlohmann::json;

/*!
 * Construct a detector from a JSON configuration file and an optional user-specified
 * parsing method.
 */
vpDetectorDNNOpenCV::vpDetectorDNNOpenCV(const std::string &jsonPath, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
  : m_applySizeFilterAfterNMS(false), m_blob(), m_I_color(), m_img(),
    m_net(), m_netConfig(), m_outNames(), m_dnnRes()
{
  initFromJSON(jsonPath);
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
  setParsingMethod(m_netConfig.m_parsingMethodType, parsingMethod);
}

/*!
 * Read the detector configuration from a JSON file and load the corresponding network.
 */
void vpDetectorDNNOpenCV::initFromJSON(const std::string &jsonPath)
{
  std::ifstream file(jsonPath);
  if (!file.good()) {
    std::stringstream ss;
    ss << "Problem opening file " << jsonPath << ". Make sure it exists and is readable" << std::endl;
    throw vpException(vpException::ioError, ss.str());
  }
  json j;
  try {
    j = json::parse(file);
  }
  catch (json::parse_error &e) {
    std::stringstream msg;
    msg << "Could not parse JSON file: \n";
    msg << e.what() << std::endl;
    msg << "Byte position of error: " << e.byte;
    throw vpException(vpException::ioError, msg.str());
  }
  *this = j; // Calls from_json(const json &j, vpDetectorDNNOpenCV &network) to read the JSON
  file.close();
  readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
}

/*!
 * Save the current detector configuration in a JSON file.
 */
void vpDetectorDNNOpenCV::saveConfigurationInJSON(const std::string &jsonPath) const
{
  std::ofstream file(jsonPath);
  const json j = *this;
  file << j.dump(4);
  file.close();
}
#endif
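
// Usage sketch for the JSON path (illustrative; "detector.json" is a hypothetical
// file name): a configuration can be saved once and restored later.
//   detector.saveConfigurationInJSON("detector.json"); // serialize the NetConfig
//   vpDetectorDNNOpenCV restored;
//   restored.initFromJSON("detector.json");            // reload config and network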

/*!
 * Destructor.
 */
vpDetectorDNNOpenCV::~vpDetectorDNNOpenCV() { }

// The following overloads convert the input ViSP image into the cv::Mat format
// expected by the network before running the detection.
bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::vector<DetectedFeatures2D> &output)
{
  vpImageConvert::convert(I, m_I_color);

  return detect(m_I_color, output);
}

bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
{
  vpImageConvert::convert(I, m_I_color);

  return detect(m_I_color, output);
}

bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
{
  vpImageConvert::convert(I, m_I_color);

  return detect(m_I_color, output);
}

bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::vector<DetectedFeatures2D> &output)
{
  vpImageConvert::convert(I, m_img);

  return detect(m_img, output);
}

bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
{
  vpImageConvert::convert(I, m_img);

  return detect(m_img, output);
}

bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
{
  vpImageConvert::convert(I, m_img);

  return detect(m_img, output);
}

/*!
 * Run the network on I and store the detections that survive Non-Maximum Suppression
 * in output. Returns true if at least one object was detected.
 */
bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::vector<DetectedFeatures2D> &output)
{
  m_img = I;
  output.clear();

  cv::Size inputSize(m_netConfig.m_inputSize.width > 0 ? m_netConfig.m_inputSize.width : m_img.cols,
                     m_netConfig.m_inputSize.height > 0 ? m_netConfig.m_inputSize.height : m_img.rows);
  cv::dnn::blobFromImage(m_img, m_blob, m_netConfig.m_scaleFactor, inputSize, m_netConfig.m_mean, m_netConfig.m_swapRB, false);

  m_net.setInput(m_blob);
  try {
    m_net.forward(m_dnnRes, m_outNames);
  }
  catch (const cv::Exception &e) {
    std::cerr << "Caught an exception trying to run inference:" << std::endl << "\t"
              << e.what()
              << "\nCUDA and/or the GPU driver might not be correctly installed. Setting preferable backend to CPU and trying again." << std::endl;
    m_net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
    m_net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    m_net.forward(m_dnnRes, m_outNames);
  }

  DetectionCandidates proposals;
  postProcess(proposals);
  size_t nbClassNames = m_netConfig.m_classNames.size();
  for (size_t i = 0; i < m_indices.size(); ++i) {
    int idx = m_indices[i];
    cv::Rect box = proposals.m_boxes[idx];
    std::optional<std::string> classname_opt;
    if (nbClassNames > 0) {
      classname_opt = m_netConfig.m_classNames[proposals.m_classIds[idx]];
    }
    output.emplace_back(box.x, box.x + box.width, box.y, box.y + box.height
                        , proposals.m_classIds[idx], proposals.m_confidences[idx]
                        , classname_opt
    );
  }

  if (m_applySizeFilterAfterNMS) {
    // Removing false detections, based on the bbox sizes
    output = filterDetectionMultiClassInput(output, m_netConfig.m_filterSizeRatio);
  }

  return !output.empty();
}
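
// Usage sketch (illustrative): running detection on an OpenCV image and reading
// back the results. getClassId(), getBoundingBox() and vpRect::getArea() appear
// elsewhere in this file; `detector` and "image.jpg" are hypothetical.
//   cv::Mat frame = cv::imread("image.jpg");
//   std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> detections;
//   if (detector.detect(frame, detections)) {
//     for (const auto &det : detections) {
//       vpRect bbox = det.getBoundingBox();
//       std::cout << "class " << det.getClassId() << " area " << bbox.getArea() << std::endl;
//     }
//   }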

/*!
 * Same as above, except that the detections are grouped by class name in output.
 */
bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
{
  m_img = I;
  output.clear();

  cv::Size inputSize(m_netConfig.m_inputSize.width > 0 ? m_netConfig.m_inputSize.width : m_img.cols,
                     m_netConfig.m_inputSize.height > 0 ? m_netConfig.m_inputSize.height : m_img.rows);
  cv::dnn::blobFromImage(m_img, m_blob, m_netConfig.m_scaleFactor, inputSize, m_netConfig.m_mean, m_netConfig.m_swapRB, false);

  m_net.setInput(m_blob);
  try {
    m_net.forward(m_dnnRes, m_outNames);
  }
  catch (const cv::Exception &e) {
    std::cerr << "Caught an exception trying to run inference:" << std::endl << "\t"
              << e.what()
              << "\nCUDA and/or the GPU driver might not be correctly installed. Setting preferable backend to CPU and trying again." << std::endl;
    m_net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
    m_net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    m_net.forward(m_dnnRes, m_outNames);
  }

  DetectionCandidates proposals;
  postProcess(proposals);
  size_t nbClassNames = m_netConfig.m_classNames.size();
  for (size_t i = 0; i < m_indices.size(); ++i) {
    int idx = m_indices[i];
    cv::Rect box = proposals.m_boxes[idx];
    std::string classname;
    if (nbClassNames > 0) {
      classname = m_netConfig.m_classNames[proposals.m_classIds[idx]];
    }
    else {
      classname = std::to_string(proposals.m_classIds[idx]);
    }
    std::optional<std::string> classname_opt = std::optional<std::string>(classname);
    output[classname].emplace_back(box.x, box.x + box.width, box.y, box.y + box.height
                                   , proposals.m_classIds[idx], proposals.m_confidences[idx]
                                   , classname_opt
    );
  }

  if (m_applySizeFilterAfterNMS) {
    output = filterDetectionMultiClassInput(output, m_netConfig.m_filterSizeRatio);
  }

  return !output.empty();
}

/*!
 * Same as above, with the class-name/detections pairs stored in a vector.
 */
bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
{
  std::map< std::string, std::vector<DetectedFeatures2D>> map_output;
  bool returnStatus = detect(I, map_output);
  for (const auto &key_val : map_output) {
    output.push_back(key_val);
  }
  return returnStatus;
}
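
// Usage sketch (illustrative, reusing the hypothetical `detector` and `frame` from
// above): the map overload groups detections by class name, which makes per-class
// processing straightforward.
//   std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> byClass;
//   detector.detect(frame, byClass);
//   for (const auto &kv : byClass) {
//     std::cout << kv.first << ": " << kv.second.size() << " detection(s)" << std::endl;
//   }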

#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
/*!
 * Get the names of the output layers of the DNN. Used with OpenCV 3.4.3, where
 * cv::dnn::Net::getUnconnectedOutLayersNames() is not available.
 */
std::vector<cv::String> vpDetectorDNNOpenCV::getOutputsNames()
{
  static std::vector<cv::String> names;
  if (names.empty()) {
    std::vector<int> outLayers = m_net.getUnconnectedOutLayers();
    std::vector<cv::String> layersNames = m_net.getLayerNames();
    names.resize(outLayers.size());
    for (size_t i = 0; i < outLayers.size(); ++i)
      names[i] = layersNames[outLayers[i] - 1];
  }
  return names;
}
#endif

/*!
 * Post-process the raw network results, dispatching to the parsing method that matches
 * the network type, then apply Non-Maximum Suppression to the resulting proposals.
 */
void vpDetectorDNNOpenCV::postProcess(DetectionCandidates &proposals)
{
  switch (m_netConfig.m_parsingMethodType) {
  case YOLO_V3:
  case YOLO_V4:
    postProcess_YoloV3_V4(proposals, m_dnnRes, m_netConfig);
    break;
  case YOLO_V5:
  case YOLO_V7:
    postProcess_YoloV5_V7(proposals, m_dnnRes, m_netConfig);
    break;
  case YOLO_V8:
    postProcess_YoloV8(proposals, m_dnnRes, m_netConfig);
    break;
  case FASTER_RCNN:
    postProcess_FasterRCNN(proposals, m_dnnRes, m_netConfig);
    break;
  case SSD_MOBILENET:
#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
    postProcess_SSD_MobileNet(proposals, m_dnnRes, m_netConfig);
#else
    // NB: the two SSD-MobileNet DNNs that have been tested worked only
    // using the ResNet-10 parsing method
    postProcess_ResNet_10(proposals, m_dnnRes, m_netConfig);
#endif
    break;
  case RESNET_10:
    postProcess_ResNet_10(proposals, m_dnnRes, m_netConfig);
    break;
  case USER_SPECIFIED:
    m_parsingMethod(proposals, m_dnnRes, m_netConfig);
    break;
  default:
    throw(vpException(vpException::badValue, "Type of DNN post-processing method not handled."));
  }

  m_indices.clear();
  cv::dnn::NMSBoxes(proposals.m_boxes, proposals.m_confidences, m_netConfig.m_confThreshold, m_netConfig.m_nmsThreshold, m_indices);
}
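
// Note on the NMS step above (minimal standalone sketch, values are illustrative):
// cv::dnn::NMSBoxes() drops the proposals below the confidence threshold, then greedily
// keeps the highest-scoring box and suppresses the remaining boxes whose IoU with a kept
// box exceeds the NMS threshold. Only the surviving indices are stored in m_indices.
//   std::vector<cv::Rect> boxes { cv::Rect(0, 0, 100, 100), cv::Rect(10, 10, 100, 100) };
//   std::vector<float> scores { 0.9f, 0.8f };
//   std::vector<int> kept;
//   cv::dnn::NMSBoxes(boxes, scores, 0.5f, 0.4f, kept);
//   // kept == {0}: the two boxes overlap with IoU ~0.68, so the lower-scored one is suppressed.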

/*!
 * Filter out detections whose bounding-box area deviates too much from the average area
 * of the class: a detection is kept only if its area lies in
 * [minRatioOfAreaOk * meanArea, meanArea / minRatioOfAreaOk).
 */
std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>
vpDetectorDNNOpenCV::filterDetectionSingleClassInput(const std::vector<DetectedFeatures2D> &detected_features, const double minRatioOfAreaOk)
{
  double meanArea(0.);
  double originalNumberOfObj = static_cast<double>(detected_features.size());
  double meanFactor = 1. / originalNumberOfObj;

  // Computing the average area of the class
  for (const DetectedFeatures2D &feature : detected_features) {
    meanArea += feature.m_bbox.getArea();
  }
  meanArea *= meanFactor;

  // Keeping only the detections that respect the area criterion
  std::vector<DetectedFeatures2D> filtered_features;
  for (const DetectedFeatures2D &feature : detected_features) {
    if (feature.m_bbox.getArea() >= minRatioOfAreaOk * meanArea && feature.m_bbox.getArea() < meanArea / minRatioOfAreaOk) {
      filtered_features.push_back(feature);
    }
  }

  return filtered_features;
}
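
// Worked example (illustrative numbers): with minRatioOfAreaOk = 0.5 and four detections
// of areas 2000, 2100, 2200 and 9000 px^2, the mean area is 3825 px^2 and the accepted
// range is [1912.5, 7650). The first three detections are kept while the 9000 px^2
// outlier is rejected.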

/*!
 * Same as above for a multi-class input: the mean area is computed per class and each
 * detection is compared to the mean of its own class.
 */
std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>
vpDetectorDNNOpenCV::filterDetectionMultiClassInput(const std::vector<DetectedFeatures2D> &detected_features, const double minRatioOfAreaOk)
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS
  /*!
   * Helper that accumulates, for each class, the number of detections and the sum of
   * their bounding-box areas, in order to compute the per-class mean area.
   */
  class MeanAreaComputer
  {
  private:
    std::map<int, std::pair<int, double>> m_map_id_pairOccurrencesAreas;
    std::map<int, double> m_mapMeans;

    double computeMeanArea(const int &class_id)
    {
      return m_map_id_pairOccurrencesAreas[class_id].second / (double)m_map_id_pairOccurrencesAreas[class_id].first;
    }

  public:
    void computeMeans()
    {
      for (const auto &classID_pair : m_map_id_pairOccurrencesAreas) {
        m_mapMeans[classID_pair.first] = computeMeanArea(classID_pair.first);
      }
    }

    double getMean(const int &class_id)
    {
      if (m_mapMeans.find(class_id) == m_mapMeans.end()) {
        throw(vpException(vpException::badValue, "[MeanAreaComputer::getMean] Asking for class_id \"" + std::to_string(class_id) + "\" that is not present in m_mapMeans. Did you call computeMeans ?"));
      }
      return m_mapMeans[class_id];
    }

    void operator()(const DetectedFeatures2D &feature)
    {
      int class_id = feature.getClassId();
      double area = feature.getBoundingBox().getArea();
      if (m_map_id_pairOccurrencesAreas.find(class_id) == m_map_id_pairOccurrencesAreas.end()) {
        m_map_id_pairOccurrencesAreas[class_id] = std::pair<int, double>(1, area);
      }
      else {
        std::pair<int, double> prev_state = m_map_id_pairOccurrencesAreas[class_id];
        m_map_id_pairOccurrencesAreas[class_id] = std::pair<int, double>(prev_state.first + 1, prev_state.second + area);
      }
    }
  };
#endif // DOXYGEN_SHOULD_SKIP_THIS

  // Computing the average area of each class. Note that std::for_each takes its functor
  // by value, so the accumulated state must be recovered from its return value.
  MeanAreaComputer meanComputer = std::for_each(detected_features.begin(), detected_features.end(), MeanAreaComputer());
  meanComputer.computeMeans();

  // Keeping only the detections that respect the area criterion
  std::vector<DetectedFeatures2D> filtered_features;
  for (const DetectedFeatures2D &feature : detected_features) {
    double meanArea = meanComputer.getMean(feature.getClassId());
    if (feature.m_bbox.getArea() >= minRatioOfAreaOk * meanArea
        && feature.m_bbox.getArea() < meanArea / minRatioOfAreaOk) {
      filtered_features.push_back(feature);
    }
  }

  return filtered_features;
}

/*!
 * Same as above for detections grouped by class name: each class is filtered independently.
 */
std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>>
vpDetectorDNNOpenCV::filterDetectionMultiClassInput(const std::map< std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> &detected_features, const double minRatioOfAreaOk)
{
  std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> output;
  for (const auto &keyval : detected_features) {
    output[keyval.first] = filterDetectionSingleClassInput(keyval.second, minRatioOfAreaOk); // removing false detections
  }
  return output;
}

/*!
 * Parse the raw results of a YoloV3 or YoloV4 network. Box coordinates are expressed
 * relative to the image size.
 */
void vpDetectorDNNOpenCV::postProcess_YoloV3_V4(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  size_t nbBatches = dnnRes.size();

  for (size_t i = 0; i < nbBatches; i++) {
    // Slightly modified from: https://github.com/opencv/opencv/blob/8c25a8eb7b10fb50cda323ee6bec68aa1a9ce43c/samples/dnn/object_detection.cpp#L192-L221
    // Counts the number of proposed detections and the number of data corresponding to 1 detection
    int num_proposal = dnnRes[i].size[0]; // Number of detections
    int nout = dnnRes[i].size[1]; // Number of data for each detection
    if (dnnRes[i].dims > 2) {
      num_proposal = dnnRes[i].size[1];
      nout = dnnRes[i].size[2];
      dnnRes[i] = dnnRes[i].reshape(0, num_proposal);
    }

    int n = 0, row_ind = 0;
    float *pdata = (float *)dnnRes[i].data;

    // Iterate on the detections to keep only the meaningful ones
    for (n = 0; n < num_proposal; n++) {
      float box_score = pdata[4];
      if (box_score > netConfig.m_confThreshold) {
        cv::Mat scores = dnnRes[i].row(row_ind).colRange(5, nout);
        cv::Point classIdPoint;
        double max_class_score;
        // Get the value and location of the maximum score
        cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);

        max_class_score *= box_score;

        // The detection is kept only if the confidence is greater than the threshold
        if (max_class_score > netConfig.m_confThreshold) {
          const int class_idx = classIdPoint.x;
          float cx = pdata[0] * m_img.cols;
          float cy = pdata[1] * m_img.rows;
          float w = pdata[2] * m_img.cols;
          float h = pdata[3] * m_img.rows;

          int left = int(cx - 0.5 * w);
          int top = int(cy - 0.5 * h);

          proposals.m_confidences.push_back((float)max_class_score);
          proposals.m_boxes.push_back(cv::Rect(left, top, (int)(w), (int)(h)));
          proposals.m_classIds.push_back(class_idx);
        }
      }
      row_ind++;
      pdata += nout;
    }
  }
}

/*!
 * Parse the raw results of a YoloV5 or YoloV7 network. Box coordinates are expressed
 * in network-input pixels and are rescaled to the original image size.
 */
void vpDetectorDNNOpenCV::postProcess_YoloV5_V7(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Compute the ratio between the original size of the image and the network size to translate network coordinates into
  // image coordinates
  float ratioh = (float)m_img.rows / netConfig.m_inputSize.height, ratiow = (float)m_img.cols / netConfig.m_inputSize.width;
  size_t nbBatches = dnnRes.size();

  for (size_t i = 0; i < nbBatches; i++) {
    // Counts the number of proposed detections and the number of data corresponding to 1 detection
    int num_proposal = dnnRes[i].size[0]; // Number of detections
    int nout = dnnRes[i].size[1]; // Number of data for each detection
    if (dnnRes[i].dims > 2) {
      num_proposal = dnnRes[i].size[1];
      nout = dnnRes[i].size[2];
      dnnRes[i] = dnnRes[i].reshape(0, num_proposal);
    }

    int n = 0, row_ind = 0;
    float *pdata = (float *)dnnRes[i].data;

    // Iterate on the detections to keep only the meaningful ones
    for (n = 0; n < num_proposal; n++) {
      float box_score = pdata[4];

      if (box_score > netConfig.m_confThreshold) {
        cv::Mat scores = dnnRes[i].row(row_ind).colRange(5, nout);
        cv::Point classIdPoint;
        double max_class_score;
        // Get the value and location of the maximum score
        cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);
        max_class_score *= box_score;

        // The detection is kept only if the confidence is greater than the threshold
        if (max_class_score > netConfig.m_confThreshold) {
          const int class_idx = classIdPoint.x;
          float cx = pdata[0] * ratiow;
          float cy = pdata[1] * ratioh;
          float w = pdata[2] * ratiow;
          float h = pdata[3] * ratioh;

          int left = int(cx - 0.5 * w);
          int top = int(cy - 0.5 * h);

          proposals.m_confidences.push_back((float)max_class_score);
          proposals.m_boxes.push_back(cv::Rect(left, top, (int)(w), (int)(h)));
          proposals.m_classIds.push_back(class_idx);
        }
      }
      row_ind++;
      pdata += nout;
    }
  }
}

/*!
 * Parse the raw results of a YoloV8 network. The output tensor is transposed so that it
 * can be traversed like a YoloV5 output; there is no objectness score, only class scores.
 */
void vpDetectorDNNOpenCV::postProcess_YoloV8(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Code adapted from here: https://github.com/JustasBart/yolov8_CPP_Inference_OpenCV_ONNX/blob/minimalistic/inference.cpp
  // Compute the ratio between the original size of the image and the network size to translate network coordinates into
  // image coordinates
  float ratioh = (float)m_img.rows / netConfig.m_inputSize.height, ratiow = (float)m_img.cols / netConfig.m_inputSize.width;
  size_t nbBatches = dnnRes.size();

  for (size_t i = 0; i < nbBatches; i++) {
    // Counts the number of proposed detections and the number of data corresponding to 1 detection
    int num_proposal = dnnRes[i].size[1]; // Number of detections
    int nout = dnnRes[i].size[0]; // Number of data for each detection
    if (dnnRes[i].dims > 2) {
      num_proposal = dnnRes[i].size[2];
      nout = dnnRes[i].size[1];
      dnnRes[i] = dnnRes[i].reshape(0, nout);
    }
    cv::transpose(dnnRes[i], dnnRes[i]); // Organize data as in YoloV5, i.e. [batchsize][1:num_proposals][1:4+nb_classes]

    int n = 0, row_ind = 0;
    float *pdata = (float *)dnnRes[i].data;

    // Iterate on the detections to keep only the meaningful ones
    for (n = 0; n < num_proposal; n++) {
      cv::Mat scores = dnnRes[i].row(row_ind).colRange(4, nout);
      cv::Point classIdPoint;
      double max_class_score;
      // Get the value and location of the maximum score
      cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);

      // The detection is kept only if the confidence is greater than the threshold
      if (max_class_score > netConfig.m_confThreshold) {
        const int class_idx = classIdPoint.x;
        float cx = pdata[0] * ratiow;
        float cy = pdata[1] * ratioh;
        float w = pdata[2] * ratiow;
        float h = pdata[3] * ratioh;

        int left = int(cx - 0.5 * w);
        int top = int(cy - 0.5 * h);

        proposals.m_confidences.push_back((float)max_class_score);
        proposals.m_boxes.push_back(cv::Rect(left, top, (int)(w), (int)(h)));
        proposals.m_classIds.push_back(class_idx);
      }

      row_ind++;
      pdata += nout;
    }
  }
}

/*!
 * Parse the raw results of a Faster-RCNN network.
 */
void vpDetectorDNNOpenCV::postProcess_FasterRCNN(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Direct copy from the object_detection.cpp OpenCV sample
  // Faster-RCNN

  // The network produces an output blob with a shape 1x1xNx7 where N is the number of
  // detections and every detection is a vector of values
  // [batchId, classId, confidence, left, top, right, bottom]
  size_t nbBatches = dnnRes.size();
  for (size_t j = 0; j < nbBatches; j++) {
    float *data = (float *)dnnRes[j].data;
    for (size_t i = 0; i < dnnRes[j].total(); i += 7) {
      float confidence = data[i + 2];
      if (confidence > netConfig.m_confThreshold) {
        int left = (int)(data[i + 3] * m_img.cols);
        int top = (int)(data[i + 4] * m_img.rows);
        int right = (int)(data[i + 5] * m_img.cols);
        int bottom = (int)(data[i + 6] * m_img.rows);
        int classId = (int)(data[i + 1]);

        proposals.m_confidences.push_back((float)confidence);
        proposals.m_boxes.push_back(cv::Rect(left, top, right - left + 1, bottom - top + 1));
        proposals.m_classIds.push_back(classId);
      }
    }
  }
}

#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
/*!
 * Parse the raw results of an SSD-MobileNet network.
 */
void vpDetectorDNNOpenCV::postProcess_SSD_MobileNet(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // The network produces two output blobs:
  // - `scores` with dimensions 1xNxC
  // - `boxes` with dimensions 1xNx4
  // where `N` is the number of detections and `C` is the number of classes (with `BACKGROUND` as classId = 0).

  int scores_index = m_outNames[0] == "scores" ? 0 : 1; // scores output index.
  int boxes_index = m_outNames[0] == "boxes" ? 0 : 1; // boxes output index.

  int N = dnnRes[scores_index].size[1], C = dnnRes[scores_index].size[2];

  float *confidence = (float *)dnnRes[scores_index].data;
  float *bbox = (float *)dnnRes[boxes_index].data;

  // Loop over all guesses on the output of the network.
  for (int i = 0; i < N; i++) {
    uint32_t maxClass = 0;
    float maxScore = -1000.0f;

    for (int j = 1; j < C; j++) // ignore background (classId = 0).
    {
      const float score = confidence[i * C + j];

      if (score < netConfig.m_confThreshold)
        continue;

      if (score > maxScore) {
        maxScore = score;
        maxClass = j;
      }
    }

    if (maxScore > netConfig.m_confThreshold) {
      int left = (int)(bbox[4 * i] * m_img.cols);
      int top = (int)(bbox[4 * i + 1] * m_img.rows);
      int right = (int)(bbox[4 * i + 2] * m_img.cols);
      int bottom = (int)(bbox[4 * i + 3] * m_img.rows);
      int width = right - left + 1;
      int height = bottom - top + 1;

      int classId = maxClass;
      proposals.m_confidences.push_back(maxScore);
      proposals.m_boxes.push_back(cv::Rect(left, top, width, height));
      proposals.m_classIds.push_back(classId);
    }
  }
}
#endif

/*!
 * Parse the raw results of a ResNet-10 network.
 */
void vpDetectorDNNOpenCV::postProcess_ResNet_10(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Direct copy from the object_detection.cpp OpenCV sample

  // The network produces an output blob with a shape 1x1xNx7 where N is the number of
  // detections and every detection is a vector of values
  // [batchId, classId, confidence, left, top, right, bottom]
  CV_Assert(dnnRes.size() == 1);
  float *data = (float *)dnnRes[0].data;
  for (size_t i = 0; i < dnnRes[0].total(); i += 7) {
    float confidence = data[i + 2];
    if (confidence > netConfig.m_confThreshold) {
      int left = (int)(data[i + 3] * m_img.cols);
      int top = (int)(data[i + 4] * m_img.rows);
      int right = (int)(data[i + 5] * m_img.cols);
      int bottom = (int)(data[i + 6] * m_img.rows);
      int classId = (int)(data[i + 1]) - 1;

      proposals.m_confidences.push_back((float)confidence);
      proposals.m_boxes.push_back(cv::Rect(left, top, right - left + 1, bottom - top + 1));
      proposals.m_classIds.push_back(classId);
    }
  }
}

/*!
 * Placeholder parsing method that throws, used when a USER_SPECIFIED network is
 * configured without a post-processing callback.
 */
void vpDetectorDNNOpenCV::postProcess_unimplemented(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  (void)proposals;
  (void)dnnRes;
  (void)netConfig;
  throw(vpException(vpException::functionNotImplementedError, "vpDetectorDNNOpenCV::postProcess was called with a USER_SPECIFIED DNN but no post-processing method was given."));
}

/*!
 * Read a network from disk, store the corresponding file names in the configuration
 * and retrieve the names of the network output layers.
 */
void vpDetectorDNNOpenCV::readNet(const std::string &model, const std::string &config, const std::string &framework)
{
  m_netConfig.m_modelFilename = model;
  m_netConfig.m_modelConfigFilename = config;
  m_netConfig.m_framework = framework;
  m_net = cv::dnn::readNet(model, config, framework);
#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
  m_outNames = getOutputsNames();
#else
  m_outNames = m_net.getUnconnectedOutLayersNames();
#endif
}
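
// Usage sketch (illustrative; the file names are hypothetical and empty-string
// defaults for `config` and `framework` are assumed to be declared in the header):
//   detector.readNet("yolov5s.onnx");                                         // ONNX: no config file needed
//   detector.readNet("res10_300x300.caffemodel", "deploy.prototxt", "caffe"); // Caffe: weights + prototxt
// cv::dnn::readNet() infers the framework from the file extension when the hint is empty.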

/*!
 * Set the whole network configuration, then reload the network if a model file is given.
 */
void vpDetectorDNNOpenCV::setNetConfig(const NetConfig &config)
{
  m_netConfig = config;
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
  setParsingMethod(m_netConfig.m_parsingMethodType);
  if (!m_netConfig.m_modelFilename.empty()) {
    readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
  }
}

/*!
 * Set the confidence threshold used to keep a detection.
 */
void vpDetectorDNNOpenCV::setConfidenceThreshold(const float &confThreshold) { m_netConfig.m_confThreshold = confThreshold; }

/*!
 * Set the Intersection-over-Union threshold used by the Non-Maximum Suppression step.
 */
void vpDetectorDNNOpenCV::setNMSThreshold(const float &nmsThreshold) { m_netConfig.m_nmsThreshold = nmsThreshold; }

/*!
 * Set the size-ratio filter and enable it when the ratio is strictly positive.
 */
void vpDetectorDNNOpenCV::setDetectionFilterSizeRatio(const double &sizeRatio)
{
  m_netConfig.m_filterSizeRatio = sizeRatio;
  if (m_netConfig.m_filterSizeRatio > std::numeric_limits<double>::epsilon()) {
    m_applySizeFilterAfterNMS = true;
  }
  else {
    m_applySizeFilterAfterNMS = false;
  }
}

/*!
 * Set the size of the image given as input to the network.
 */
void vpDetectorDNNOpenCV::setInputSize(const int &width, const int &height)
{
  m_netConfig.m_inputSize.width = width;
  m_netConfig.m_inputSize.height = height;
}

/*!
 * Set the mean subtracted from each channel before feeding the image to the network.
 */
void vpDetectorDNNOpenCV::setMean(const double &meanR, const double &meanG, const double &meanB) { m_netConfig.m_mean = cv::Scalar(meanR, meanG, meanB); }

/*!
 * Set the preferable backend of the DNN (see cv::dnn::Net::setPreferableBackend).
 */
void vpDetectorDNNOpenCV::setPreferableBackend(const int &backendId) { m_net.setPreferableBackend(backendId); }

/*!
 * Set the preferable target of the DNN (see cv::dnn::Net::setPreferableTarget).
 */
void vpDetectorDNNOpenCV::setPreferableTarget(const int &targetId) { m_net.setPreferableTarget(targetId); }

/*!
 * Set the scale factor applied to the pixel values before inference.
 */
void vpDetectorDNNOpenCV::setScaleFactor(const double &scaleFactor)
{
  m_netConfig.m_scaleFactor = scaleFactor;
  if ((m_netConfig.m_parsingMethodType == YOLO_V7 || m_netConfig.m_parsingMethodType == YOLO_V8) && m_netConfig.m_scaleFactor != 1 / 255.) {
    std::cout << "[vpDetectorDNNOpenCV::setScaleFactor] WARNING: scale factor should be 1/255. to normalize pixel values." << std::endl;
  }
}

/*!
 * Set whether to swap the R and B channels when creating the input blob.
 */
void vpDetectorDNNOpenCV::setSwapRB(const bool &swapRB) { m_netConfig.m_swapRB = swapRB; }

/*!
 * Set the parsing method and, for USER_SPECIFIED networks, the corresponding callback.
 * For YoloV7 and YoloV8 the scale factor is forced to 1/255.
 */
void vpDetectorDNNOpenCV::setParsingMethod(const DNNResultsParsingType &typeParsingMethod, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
{
  m_netConfig.m_parsingMethodType = typeParsingMethod;
  m_parsingMethod = parsingMethod;
  if ((m_netConfig.m_parsingMethodType == YOLO_V7 || m_netConfig.m_parsingMethodType == YOLO_V8) && m_netConfig.m_scaleFactor != 1 / 255.) {
    m_netConfig.m_scaleFactor = 1 / 255.;
    std::cout << "[vpDetectorDNNOpenCV::setParsingMethod] NB: scale factor changed to 1/255. to normalize pixel values." << std::endl;
  }

#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
  if (m_netConfig.m_parsingMethodType == SSD_MOBILENET) {
    std::cout << "[vpDetectorDNNOpenCV::setParsingMethod] WARNING: The chosen type of network is " << dnnResultsParsingTypeToString(m_netConfig.m_parsingMethodType) << " and VISP_BUILD_DEPRECATED_FUNCTIONS is set to true." << std::endl;
    std::cout << "\tThe parsing method that worked with the networks quoted in the ViSP documentation was postProcess_ResNet_10 instead of postProcess_SSD_MobileNet." << std::endl;
    std::cout << "\tIf the SSD-MobileNet network does not seem to work, please try to recompile ViSP setting VISP_BUILD_DEPRECATED_FUNCTIONS as false." << std::endl << std::flush;
  }
#endif
}
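
// Sketch of a USER_SPECIFIED parsing method (illustrative): the callback fills the
// proposals from the raw network outputs. The layout of dnnRes is entirely
// network-dependent; the 6-value record used below is a placeholder.
//   void myPostProcess(vpDetectorDNNOpenCV::DetectionCandidates &proposals,
//                      std::vector<cv::Mat> &dnnRes,
//                      const vpDetectorDNNOpenCV::NetConfig &netConfig)
//   {
//     float *data = (float *)dnnRes[0].data;
//     for (size_t i = 0; i < dnnRes[0].total(); i += 6) { // hypothetical [classId, confidence, x, y, w, h] records
//       if (data[i + 1] > netConfig.m_confThreshold) {
//         proposals.m_classIds.push_back((int)data[i]);
//         proposals.m_confidences.push_back(data[i + 1]);
//         proposals.m_boxes.push_back(cv::Rect((int)data[i + 2], (int)data[i + 3],
//                                              (int)data[i + 4], (int)data[i + 5]));
//       }
//     }
//   }
//   // Registered with: detector.setParsingMethod(vpDetectorDNNOpenCV::USER_SPECIFIED, myPostProcess);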

#elif !defined(VISP_BUILD_SHARED_LIBS)
// Workaround to avoid warning: libvisp_core.a(vpDetectorDNNOpenCV.cpp.o) has no symbols
void dummy_vpDetectorDNN() { };
#endif