vpDetectorDNNOpenCV.cpp
/****************************************************************************
 *
 * ViSP, open source Visual Servoing Platform software.
 * Copyright (C) 2005 - 2023 by Inria. All rights reserved.
 *
 * This software is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * See the file LICENSE.txt at the root directory of this source
 * distribution for additional information about the GNU GPL.
 *
 * For using ViSP with software that can not be combined with the GNU
 * GPL, please contact Inria about acquiring a ViSP Professional
 * Edition License.
 *
 * See https://visp.inria.fr for more information.
 *
 * This software was developed at:
 * Inria Rennes - Bretagne Atlantique
 * Campus Universitaire de Beaulieu
 * 35042 Rennes Cedex
 * France
 *
 * If you have questions regarding the use of this file, please contact
 * Inria at visp@inria.fr
 *
 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
 * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Description:
 * DNN object detection using OpenCV DNN module.
 *
*****************************************************************************/
#include <visp3/core/vpConfig.h>

// Check if std=c++17 or higher
#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
  ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))

#include <visp3/core/vpImageConvert.h>
#include <visp3/detection/vpDetectorDNNOpenCV.h>
#include <visp3/core/vpIoTools.h>

#include <algorithm>
BEGIN_VISP_NAMESPACE
std::string vpDetectorDNNOpenCV::getAvailableDnnResultsParsingTypes()
{
  std::string list = "[";
  for (unsigned int i = 0; i < vpDetectorDNNOpenCV::COUNT - 1; i++) {
    list += "\"" + dnnResultsParsingTypeToString((vpDetectorDNNOpenCV::DNNResultsParsingType)i) + "\", ";
  }
  list += "\"" + dnnResultsParsingTypeToString((vpDetectorDNNOpenCV::DNNResultsParsingType)(vpDetectorDNNOpenCV::COUNT - 1)) + "\"]";
  return list;
}

std::string vpDetectorDNNOpenCV::dnnResultsParsingTypeToString(const DNNResultsParsingType &type)
{
  std::string name;
  switch (type) {
  case YOLO_V3:
    name = "yolov3";
    break;
  case YOLO_V4:
    name = "yolov4";
    break;
  case YOLO_V5:
    name = "yolov5";
    break;
  case YOLO_V7:
    name = "yolov7";
    break;
  case YOLO_V8:
    name = "yolov8";
    break;
  case FASTER_RCNN:
    name = "faster-rcnn";
    break;
  case SSD_MOBILENET:
    name = "ssd-mobilenet";
    break;
  case RESNET_10:
    name = "resnet-10";
    break;
  case USER_SPECIFIED:
    name = "user-specified";
    break;
  case COUNT:
    name = "unknown";
    break;
  }
  return name;
}

vpDetectorDNNOpenCV::DNNResultsParsingType vpDetectorDNNOpenCV::dnnResultsParsingTypeFromString(const std::string &name)
{
  vpDetectorDNNOpenCV::DNNResultsParsingType res(COUNT);
  bool hasFoundMatch = false;
  std::string name_lowercase = vpIoTools::toLowerCase(name);
  for (int id = 0; id < COUNT && !hasFoundMatch; id++) {
    vpDetectorDNNOpenCV::DNNResultsParsingType temp = (vpDetectorDNNOpenCV::DNNResultsParsingType)id;
    if (dnnResultsParsingTypeToString(temp) == name_lowercase) {
      res = temp;
      hasFoundMatch = true;
    }
  }
  return res;
}

std::vector<std::string> vpDetectorDNNOpenCV::parseClassNamesFile(const std::string &filename)
{
  return NetConfig::parseClassNamesFile(filename);
}

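/*
  Usage sketch (editor's example, not part of the library): mapping between
  parsing-type names and the enum, and loading class names. The file name
  "coco_classes.txt" is a placeholder; depending on your ViSP version the
  class-names file is expected to hold either one class name per line or a
  JSON-style list such as ["person", "bicycle", "car"].

  \code
  vpDetectorDNNOpenCV::DNNResultsParsingType type =
      vpDetectorDNNOpenCV::dnnResultsParsingTypeFromString("YoloV8"); // matching is case-insensitive -> YOLO_V8
  std::string name = vpDetectorDNNOpenCV::dnnResultsParsingTypeToString(type); // "yolov8"
  std::vector<std::string> classes = vpDetectorDNNOpenCV::parseClassNamesFile("coco_classes.txt");
  \endcode
*/
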
vpDetectorDNNOpenCV::vpDetectorDNNOpenCV()
  : m_applySizeFilterAfterNMS(false), m_blob(), m_I_color(), m_img(),
    m_net(), m_netConfig(), m_outNames(), m_dnnRes(),
    m_parsingMethod(vpDetectorDNNOpenCV::postProcess_unimplemented)
{
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
}

vpDetectorDNNOpenCV::vpDetectorDNNOpenCV(const NetConfig &config, const DNNResultsParsingType &typeParsingMethod, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
  : m_applySizeFilterAfterNMS(false), m_blob(), m_I_color(), m_img(),
    m_net(), m_netConfig(config), m_outNames(), m_dnnRes()
{
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
  setParsingMethod(typeParsingMethod, parsingMethod);
  if (!m_netConfig.m_modelFilename.empty()) {
    readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
  }
}

#ifdef VISP_HAVE_NLOHMANN_JSON

using json = nlohmann::json;

vpDetectorDNNOpenCV::vpDetectorDNNOpenCV(const std::string &jsonPath, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
  : m_applySizeFilterAfterNMS(false), m_blob(), m_I_color(), m_img(),
    m_net(), m_netConfig(), m_outNames(), m_dnnRes()
{
  initFromJSON(jsonPath);
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
  setParsingMethod(m_netConfig.m_parsingMethodType, parsingMethod);
}

void vpDetectorDNNOpenCV::initFromJSON(const std::string &jsonPath)
{
  std::ifstream file(jsonPath);
  if (!file.good()) {
    std::stringstream ss;
    ss << "Problem opening file " << jsonPath << ". Make sure it exists and is readable" << std::endl;
    throw vpException(vpException::ioError, ss.str());
  }
  json j;
  try {
    j = json::parse(file);
  }
  catch (json::parse_error &e) {
    std::stringstream msg;
    msg << "Could not parse JSON file:\n";
    msg << e.what() << std::endl;
    msg << "Byte position of error: " << e.byte;
    throw vpException(vpException::ioError, msg.str());
  }
  *this = j; // Calls from_json(const json &j, vpDetectorDNNOpenCV &network) to deserialize the configuration
  file.close();
  readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
}

void vpDetectorDNNOpenCV::saveConfigurationInJSON(const std::string &jsonPath) const
{
  std::ofstream file(jsonPath);
  const json j = *this;
  file << j.dump(4);
  file.close();
}
#endif

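/*
  Usage sketch (editor's example, not part of the library): round-tripping a
  detector configuration through JSON, assuming ViSP was built with nlohmann
  JSON support. The file names are placeholders; the JSON file must reference
  a valid model since initFromJSON() ends by calling readNet().

  \code
  vpDetectorDNNOpenCV detector;
  detector.initFromJSON("dnn_config.json");      // load network + settings
  detector.saveConfigurationInJSON("copy.json"); // persist the same settings
  \endcode
*/
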
vpDetectorDNNOpenCV::~vpDetectorDNNOpenCV() { }

bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::vector<DetectedFeatures2D> &output)
{
  vpImageConvert::convert(I, m_I_color);

  return detect(m_I_color, output);
}

bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
{
  vpImageConvert::convert(I, m_I_color);

  return detect(m_I_color, output);
}

bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
{
  vpImageConvert::convert(I, m_I_color);

  return detect(m_I_color, output);
}

bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::vector<DetectedFeatures2D> &output)
{
  vpImageConvert::convert(I, m_img);

  return detect(m_img, output);
}

bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
{
  vpImageConvert::convert(I, m_img);

  return detect(m_img, output);
}

bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
{
  vpImageConvert::convert(I, m_img);

  return detect(m_img, output);
}

bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::vector<DetectedFeatures2D> &output)
{
  m_img = I;
  output.clear();

  cv::Size inputSize(m_netConfig.m_inputSize.width > 0 ? m_netConfig.m_inputSize.width : m_img.cols,
                     m_netConfig.m_inputSize.height > 0 ? m_netConfig.m_inputSize.height : m_img.rows);
  cv::dnn::blobFromImage(m_img, m_blob, m_netConfig.m_scaleFactor, inputSize, m_netConfig.m_mean, m_netConfig.m_swapRB, false);

  m_net.setInput(m_blob);
  try {
    m_net.forward(m_dnnRes, m_outNames);
  }
  catch (const cv::Exception &e) {
    std::cerr << "Caught an exception trying to run inference:" << std::endl << "\t"
      << e.what()
      << "\nCUDA and/or the GPU driver might not be correctly installed. Setting preferable backend to CPU and trying again." << std::endl;
    m_net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
    m_net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    m_net.forward(m_dnnRes, m_outNames);
  }

  DetectionCandidates proposals;
  postProcess(proposals);
  size_t nbClassNames = m_netConfig.m_classNames.size();
  for (size_t i = 0; i < m_indices.size(); ++i) {
    int idx = m_indices[i];
    cv::Rect box = proposals.m_boxes[idx];
    std::optional<std::string> classname_opt;
    if (nbClassNames > 0) {
      classname_opt = m_netConfig.m_classNames[proposals.m_classIds[idx]];
    }
    output.emplace_back(box.x, box.x + box.width, box.y, box.y + box.height
                        , proposals.m_classIds[idx], proposals.m_confidences[idx]
                        , classname_opt
    );
  }

  if (m_applySizeFilterAfterNMS) {
    // Removing false detections, based on the bbox sizes
    output = filterDetectionMultiClassInput(output, m_netConfig.m_filterSizeRatio);
  }

  return !output.empty();
}

bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
{
  m_img = I;
  output.clear();

  cv::Size inputSize(m_netConfig.m_inputSize.width > 0 ? m_netConfig.m_inputSize.width : m_img.cols,
                     m_netConfig.m_inputSize.height > 0 ? m_netConfig.m_inputSize.height : m_img.rows);
  cv::dnn::blobFromImage(m_img, m_blob, m_netConfig.m_scaleFactor, inputSize, m_netConfig.m_mean, m_netConfig.m_swapRB, false);

  m_net.setInput(m_blob);
  try {
    m_net.forward(m_dnnRes, m_outNames);
  }
  catch (const cv::Exception &e) {
    std::cerr << "Caught an exception trying to run inference:" << std::endl << "\t"
      << e.what()
      << "\nCUDA and/or the GPU driver might not be correctly installed. Setting preferable backend to CPU and trying again." << std::endl;
    m_net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
    m_net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    m_net.forward(m_dnnRes, m_outNames);
  }

  DetectionCandidates proposals;
  postProcess(proposals);
  size_t nbClassNames = m_netConfig.m_classNames.size();
  for (size_t i = 0; i < m_indices.size(); ++i) {
    int idx = m_indices[i];
    cv::Rect box = proposals.m_boxes[idx];
    std::string classname;
    if (nbClassNames > 0) {
      classname = m_netConfig.m_classNames[proposals.m_classIds[idx]];
    }
    else {
      classname = std::to_string(proposals.m_classIds[idx]);
    }
    std::optional<std::string> classname_opt(classname);
    output[classname].emplace_back(box.x, box.x + box.width, box.y, box.y + box.height
                                   , proposals.m_classIds[idx], proposals.m_confidences[idx]
                                   , classname_opt
    );
  }

  if (m_applySizeFilterAfterNMS) {
    output = filterDetectionMultiClassInput(output, m_netConfig.m_filterSizeRatio);
  }

  return !output.empty();
}

bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
{
  std::map< std::string, std::vector<DetectedFeatures2D>> map_output;
  bool returnStatus = detect(I, map_output);
  for (const auto &key_val : map_output) {
    output.push_back(key_val);
  }
  return returnStatus;
}

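/*
  Usage sketch (editor's example, not part of the library): configuring a
  detector with the setters defined further below and iterating over the
  detections grouped by class. The ONNX model path, input size and thresholds
  are placeholders.

  \code
  vpDetectorDNNOpenCV detector;
  detector.readNet("yolov5n.onnx", "", "");   // hypothetical model file
  detector.setInputSize(640, 640);
  detector.setScaleFactor(1. / 255.);
  detector.setSwapRB(true);
  detector.setConfidenceThreshold(0.5f);
  detector.setNMSThreshold(0.4f);
  detector.setParsingMethod(vpDetectorDNNOpenCV::YOLO_V5);

  vpImage<vpRGBa> I; // filled by a grabber elsewhere
  std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> detections;
  if (detector.detect(I, detections)) {
    for (const auto &class_detections : detections) {
      std::cout << class_detections.first << ": " << class_detections.second.size() << " detection(s)" << std::endl;
    }
  }
  \endcode
*/
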
#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
std::vector<cv::String> vpDetectorDNNOpenCV::getOutputsNames()
{
  static std::vector<cv::String> names;
  if (names.empty()) {
    std::vector<int> outLayers = m_net.getUnconnectedOutLayers();
    std::vector<cv::String> layersNames = m_net.getLayerNames();
    names.resize(outLayers.size());
    for (size_t i = 0; i < outLayers.size(); ++i)
      names[i] = layersNames[outLayers[i] - 1]; // layer IDs start at 1, layersNames is 0-based
  }
  return names;
}
#endif

void vpDetectorDNNOpenCV::postProcess(DetectionCandidates &proposals)
{
  switch (m_netConfig.m_parsingMethodType) {
  case YOLO_V3:
  case YOLO_V4:
    postProcess_YoloV3_V4(proposals, m_dnnRes, m_netConfig);
    break;
  case YOLO_V5:
  case YOLO_V7:
    postProcess_YoloV5_V7(proposals, m_dnnRes, m_netConfig);
    break;
  case YOLO_V8:
    postProcess_YoloV8(proposals, m_dnnRes, m_netConfig);
    break;
  case FASTER_RCNN:
    postProcess_FasterRCNN(proposals, m_dnnRes, m_netConfig);
    break;
  case SSD_MOBILENET:
#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
    postProcess_SSD_MobileNet(proposals, m_dnnRes, m_netConfig);
#else
    // NB: the two SSD-MobileNet DNNs that have been tested worked only
    // using the ResNet-10 parsing method
    postProcess_ResNet_10(proposals, m_dnnRes, m_netConfig);
#endif
    break;
  case RESNET_10:
    postProcess_ResNet_10(proposals, m_dnnRes, m_netConfig);
    break;
  case USER_SPECIFIED:
    m_parsingMethod(proposals, m_dnnRes, m_netConfig);
    break;
  default:
    throw(vpException(vpException::badValue, "Type of DNN post-processing method not handled."));
  }

  m_indices.clear();
  cv::dnn::NMSBoxes(proposals.m_boxes, proposals.m_confidences, m_netConfig.m_confThreshold, m_netConfig.m_nmsThreshold, m_indices);
}

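/*
  Editor's note: cv::dnn::NMSBoxes performs class-agnostic non-maximum
  suppression here, since all proposals are pooled regardless of class.
  Boxes whose confidence is below m_confThreshold are discarded, then among
  boxes overlapping with an IoU greater than m_nmsThreshold only the most
  confident one is kept; m_indices receives the indices of the survivors.
*/
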
std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>
vpDetectorDNNOpenCV::filterDetectionSingleClassInput(const std::vector<DetectedFeatures2D> &detected_features, const double minRatioOfAreaOk)
{
  double meanArea(0.);
  double originalNumberOfObj = static_cast<double>(detected_features.size());
  double meanFactor = 1. / originalNumberOfObj;

  // Computing the average area of the class
  for (const DetectedFeatures2D &feature : detected_features) {
    meanArea += feature.m_bbox.getArea();
  }
  meanArea *= meanFactor;

  // Keeping only the detections that respect the area criterion
  std::vector<DetectedFeatures2D> filtered_features;
  for (const DetectedFeatures2D &feature : detected_features) {
    if (feature.m_bbox.getArea() >= minRatioOfAreaOk * meanArea && feature.m_bbox.getArea() < meanArea / minRatioOfAreaOk) {
      filtered_features.push_back(feature);
    }
  }

  return filtered_features;
}

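/*
  Editor's example: with minRatioOfAreaOk = 0.5 and a mean bounding-box area
  of 1000 px^2 for a given class, only detections whose area lies in
  [0.5 * 1000, 1000 / 0.5) = [500, 2000) px^2 are kept. The criterion is thus
  symmetric around the class mean on a log scale.
*/
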
std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>
vpDetectorDNNOpenCV::filterDetectionMultiClassInput(const std::vector<DetectedFeatures2D> &detected_features, const double minRatioOfAreaOk)
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS
  class MeanAreaComputer
  {
  private:
    std::map<int, std::pair<int, double>> m_map_id_pairOccurrencesAreas;
    std::map<int, double> m_mapMeans;

    double computeMeanArea(const int &class_id)
    {
      return m_map_id_pairOccurrencesAreas[class_id].second / (double)m_map_id_pairOccurrencesAreas[class_id].first;
    }

  public:
    void computeMeans()
    {
      for (const auto &classID_pair : m_map_id_pairOccurrencesAreas) {
        m_mapMeans[classID_pair.first] = computeMeanArea(classID_pair.first);
      }
    }

    double getMean(const int &class_id)
    {
      if (m_map_id_pairOccurrencesAreas.find(class_id) == m_map_id_pairOccurrencesAreas.end()) {
        throw(vpException(vpException::badValue, "[MeanAreaComputer::getMean] Asking for class_id \"" + std::to_string(class_id) + "\" that is not present in m_mapMeans. Did you call computeMeans?"));
      }
      return m_mapMeans[class_id];
    }

    void operator()(const DetectedFeatures2D &feature)
    {
      int class_id = feature.getClassId();
      double area = feature.getBoundingBox().getArea();
      if (m_map_id_pairOccurrencesAreas.find(class_id) == m_map_id_pairOccurrencesAreas.end()) {
        m_map_id_pairOccurrencesAreas[class_id] = std::pair<int, double>(1, area);
      }
      else {
        std::pair<int, double> prev_state = m_map_id_pairOccurrencesAreas[class_id];
        m_map_id_pairOccurrencesAreas[class_id] = std::pair<int, double>(prev_state.first + 1, prev_state.second + area);
      }
    }
  };
#endif // DOXYGEN_SHOULD_SKIP_THIS

  // Computing the average area of each class.
  // NB: std::for_each takes the functor by value, so keep the accumulated
  // state by assigning the returned copy back to meanComputer.
  MeanAreaComputer meanComputer;
  meanComputer = std::for_each(detected_features.begin(), detected_features.end(), meanComputer);
  meanComputer.computeMeans();

  // Keeping only the detections that respect the area criterion
  std::vector<DetectedFeatures2D> filtered_features;
  for (const DetectedFeatures2D &feature : detected_features) {
    double meanArea = meanComputer.getMean(feature.getClassId());
    if (feature.m_bbox.getArea() >= minRatioOfAreaOk * meanArea
        && feature.m_bbox.getArea() < meanArea / minRatioOfAreaOk) {
      filtered_features.push_back(feature);
    }
  }

  return filtered_features;
}

std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>>
vpDetectorDNNOpenCV::filterDetectionMultiClassInput(const std::map< std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> &detected_features, const double minRatioOfAreaOk)
{
  std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> output;
  for (const auto &keyval : detected_features) {
    output[keyval.first] = filterDetectionSingleClassInput(keyval.second, minRatioOfAreaOk); // removing false detections
  }
  return output;
}

void vpDetectorDNNOpenCV::postProcess_YoloV3_V4(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  size_t nbBatches = dnnRes.size();

  for (size_t i = 0; i < nbBatches; i++) {
    // Slightly modified from: https://github.com/opencv/opencv/blob/8c25a8eb7b10fb50cda323ee6bec68aa1a9ce43c/samples/dnn/object_detection.cpp#L192-L221
    // Counts the number of proposed detections and the number of data corresponding to 1 detection
    int num_proposal = dnnRes[i].size[0]; // Number of detections
    int nout = dnnRes[i].size[1]; // Number of data for each detection
    if (dnnRes[i].dims > 2) {
      num_proposal = dnnRes[i].size[1];
      nout = dnnRes[i].size[2];
      dnnRes[i] = dnnRes[i].reshape(0, num_proposal);
    }

    int n = 0, row_ind = 0;
    float *pdata = (float *)dnnRes[i].data;

    // Iterate on the detections to keep only the meaningful ones.
    // Each row is [cx, cy, w, h, box_score, class_score_0, ..., class_score_N]
    // with box coordinates normalized with regard to the image dimensions.
    for (n = 0; n < num_proposal; n++) {
      float box_score = pdata[4];
      if (box_score > netConfig.m_confThreshold) {
        cv::Mat scores = dnnRes[i].row(row_ind).colRange(5, nout);
        cv::Point classIdPoint;
        double max_class_score;
        // Get the value and location of the maximum score
        cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);

        max_class_score *= box_score;

        // The detection is kept only if the confidence is greater than the threshold
        if (max_class_score > netConfig.m_confThreshold) {
          const int class_idx = classIdPoint.x;
          float cx = pdata[0] * m_img.cols;
          float cy = pdata[1] * m_img.rows;
          float w = pdata[2] * m_img.cols;
          float h = pdata[3] * m_img.rows;

          int left = int(cx - 0.5 * w);
          int top = int(cy - 0.5 * h);

          proposals.m_confidences.push_back((float)max_class_score);
          proposals.m_boxes.push_back(cv::Rect(left, top, (int)(w), (int)(h)));
          proposals.m_classIds.push_back(class_idx);
        }
      }
      row_ind++;
      pdata += nout;
    }
  }
}

void vpDetectorDNNOpenCV::postProcess_YoloV5_V7(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Compute the ratio between the original size of the image and the network size to translate network coordinates into
  // image coordinates
  float ratioh = (float)m_img.rows / netConfig.m_inputSize.height, ratiow = (float)m_img.cols / netConfig.m_inputSize.width;
  size_t nbBatches = dnnRes.size();

  for (size_t i = 0; i < nbBatches; i++) {
    // Counts the number of proposed detections and the number of data corresponding to 1 detection
    int num_proposal = dnnRes[i].size[0]; // Number of detections
    int nout = dnnRes[i].size[1]; // Number of data for each detection
    if (dnnRes[i].dims > 2) {
      num_proposal = dnnRes[i].size[1];
      nout = dnnRes[i].size[2];
      dnnRes[i] = dnnRes[i].reshape(0, num_proposal);
    }

    int n = 0, row_ind = 0;
    float *pdata = (float *)dnnRes[i].data;

    // Iterate on the detections to keep only the meaningful ones
    for (n = 0; n < num_proposal; n++) {
      float box_score = pdata[4];

      if (box_score > netConfig.m_confThreshold) {
        cv::Mat scores = dnnRes[i].row(row_ind).colRange(5, nout);
        cv::Point classIdPoint;
        double max_class_score;
        // Get the value and location of the maximum score
        cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);
        max_class_score *= box_score;

        // The detection is kept only if the confidence is greater than the threshold
        if (max_class_score > netConfig.m_confThreshold) {
          const int class_idx = classIdPoint.x;
          float cx = pdata[0] * ratiow;
          float cy = pdata[1] * ratioh;
          float w = pdata[2] * ratiow;
          float h = pdata[3] * ratioh;

          int left = int(cx - 0.5 * w);
          int top = int(cy - 0.5 * h);

          proposals.m_confidences.push_back((float)max_class_score);
          proposals.m_boxes.push_back(cv::Rect(left, top, (int)(w), (int)(h)));
          proposals.m_classIds.push_back(class_idx);
        }
      }
      row_ind++;
      pdata += nout;
    }
  }
}

void vpDetectorDNNOpenCV::postProcess_YoloV8(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Code adapted from here: https://github.com/JustasBart/yolov8_CPP_Inference_OpenCV_ONNX/blob/minimalistic/inference.cpp
  // Compute the ratio between the original size of the image and the network size to translate network coordinates into
  // image coordinates
  float ratioh = (float)m_img.rows / netConfig.m_inputSize.height, ratiow = (float)m_img.cols / netConfig.m_inputSize.width;
  size_t nbBatches = dnnRes.size();

  for (size_t i = 0; i < nbBatches; i++) {
    // Counts the number of proposed detections and the number of data corresponding to 1 detection
    int num_proposal = dnnRes[i].size[1]; // Number of detections
    int nout = dnnRes[i].size[0]; // Number of data for each detection
    if (dnnRes[i].dims > 2) {
      num_proposal = dnnRes[i].size[2];
      nout = dnnRes[i].size[1];
      dnnRes[i] = dnnRes[i].reshape(0, nout);
    }
    cv::transpose(dnnRes[i], dnnRes[i]); // Organize data as in YoloV5, i.e. [batchsize][1:num_proposals][1:4+nb_classes]

    int n = 0, row_ind = 0;
    float *pdata = (float *)dnnRes[i].data;

    // Iterate on the detections to keep only the meaningful ones
    for (n = 0; n < num_proposal; n++) {
      cv::Mat scores = dnnRes[i].row(row_ind).colRange(4, nout);
      cv::Point classIdPoint;
      double max_class_score;
      // Get the value and location of the maximum score
      cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);

      // The detection is kept only if the confidence is greater than the threshold
      if (max_class_score > netConfig.m_confThreshold) {
        const int class_idx = classIdPoint.x;
        float cx = pdata[0] * ratiow;
        float cy = pdata[1] * ratioh;
        float w = pdata[2] * ratiow;
        float h = pdata[3] * ratioh;

        int left = int(cx - 0.5 * w);
        int top = int(cy - 0.5 * h);

        proposals.m_confidences.push_back((float)max_class_score);
        proposals.m_boxes.push_back(cv::Rect(left, top, (int)(w), (int)(h)));
        proposals.m_classIds.push_back(class_idx);
      }

      row_ind++;
      pdata += nout;
    }
  }
}

void vpDetectorDNNOpenCV::postProcess_FasterRCNN(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Direct copy from the object_detection.cpp OpenCV sample
  // Faster-RCNN

  // Network produces an output blob with a shape 1x1xNx7 where N is the number of
  // detections and every detection is a vector of values
  // [batchId, classId, confidence, left, top, right, bottom]
  size_t nbBatches = dnnRes.size();
  for (size_t j = 0; j < nbBatches; j++) {
    float *data = (float *)dnnRes[j].data;
    for (size_t i = 0; i < dnnRes[j].total(); i += 7) {
      float confidence = data[i + 2];
      if (confidence > netConfig.m_confThreshold) {
        int left = (int)(data[i + 3] * m_img.cols);
        int top = (int)(data[i + 4] * m_img.rows);
        int right = (int)(data[i + 5] * m_img.cols);
        int bottom = (int)(data[i + 6] * m_img.rows);
        int classId = (int)(data[i + 1]);

        proposals.m_confidences.push_back((float)confidence);
        proposals.m_boxes.push_back(cv::Rect(left, top, right - left + 1, bottom - top + 1));
        proposals.m_classIds.push_back(classId);
      }
    }
  }
}

#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
void vpDetectorDNNOpenCV::postProcess_SSD_MobileNet(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Network produces two output blobs:
  // - `scores` with dimensions 1xNxC
  // - `boxes` with dimensions 1xNx4
  // where `N` is the number of detections and `C` is the number of classes (with `BACKGROUND` as classId = 0).

  int scores_index = m_outNames[0] == "scores" ? 0 : 1; // scores output index.
  int boxes_index = m_outNames[0] == "boxes" ? 0 : 1; // boxes output index.

  int N = dnnRes[scores_index].size[1], C = dnnRes[scores_index].size[2];

  float *confidence = (float *)dnnRes[scores_index].data;
  float *bbox = (float *)dnnRes[boxes_index].data;

  // Loop over all guesses on the output of the network.
  for (int i = 0; i < N; i++) {
    uint32_t maxClass = 0;
    float maxScore = -1000.0f;

    for (int j = 1; j < C; j++) // ignore background (classId = 0).
    {
      const float score = confidence[i * C + j];

      if (score < netConfig.m_confThreshold)
        continue;

      if (score > maxScore) {
        maxScore = score;
        maxClass = j;
      }
    }

    if (maxScore > netConfig.m_confThreshold) {
      int left = (int)(bbox[4 * i] * m_img.cols);
      int top = (int)(bbox[4 * i + 1] * m_img.rows);
      int right = (int)(bbox[4 * i + 2] * m_img.cols);
      int bottom = (int)(bbox[4 * i + 3] * m_img.rows);
      int width = right - left + 1;
      int height = bottom - top + 1;

      int classId = maxClass;
      proposals.m_confidences.push_back(maxScore);
      proposals.m_boxes.push_back(cv::Rect(left, top, width, height));
      proposals.m_classIds.push_back(classId);
    }
  }
}
#endif

void vpDetectorDNNOpenCV::postProcess_ResNet_10(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Direct copy from the object_detection.cpp OpenCV sample

  // Network produces an output blob with a shape 1x1xNx7 where N is the number of
  // detections and every detection is a vector of values
  // [batchId, classId, confidence, left, top, right, bottom]
  CV_Assert(dnnRes.size() == 1);
  float *data = (float *)dnnRes[0].data;
  for (size_t i = 0; i < dnnRes[0].total(); i += 7) {
    float confidence = data[i + 2];
    if (confidence > netConfig.m_confThreshold) {
      int left = (int)(data[i + 3] * m_img.cols);
      int top = (int)(data[i + 4] * m_img.rows);
      int right = (int)(data[i + 5] * m_img.cols);
      int bottom = (int)(data[i + 6] * m_img.rows);
      int classId = (int)(data[i + 1]) - 1;

      proposals.m_confidences.push_back((float)confidence);
      proposals.m_boxes.push_back(cv::Rect(left, top, right - left + 1, bottom - top + 1));
      proposals.m_classIds.push_back(classId);
    }
  }
}

void vpDetectorDNNOpenCV::postProcess_unimplemented(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  (void)proposals;
  (void)dnnRes;
  (void)netConfig;
  throw(vpException(vpException::functionNotImplementedError, "vpDetectorDNNOpenCV::postProcess was called with a USER_SPECIFIED DNN but no post-processing method was given."));
}

void vpDetectorDNNOpenCV::readNet(const std::string &model, const std::string &config, const std::string &framework)
{
  m_netConfig.m_modelFilename = model;
  m_netConfig.m_modelConfigFilename = config;
  m_netConfig.m_framework = framework;
  m_net = cv::dnn::readNet(model, config, framework);
#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
  m_outNames = getOutputsNames();
#else
  m_outNames = m_net.getUnconnectedOutLayersNames();
#endif
}

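/*
  Usage sketch (editor's example, not part of the library): loading a network
  from disk. The file names are placeholders; for ONNX models the
  configuration file and framework can be left empty, since cv::dnn::readNet
  infers the framework from the file extension.

  \code
  vpDetectorDNNOpenCV detector;
  detector.readNet("yolov5n.onnx", "", "");                     // ONNX model
  // detector.readNet("frozen.pb", "graph.pbtxt", "tensorflow"); // TensorFlow model + config
  \endcode
*/
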
void vpDetectorDNNOpenCV::setNetConfig(const NetConfig &config)
{
  m_netConfig = config;
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
  setParsingMethod(m_netConfig.m_parsingMethodType);
  if (!m_netConfig.m_modelFilename.empty()) {
    readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
  }
}

void vpDetectorDNNOpenCV::setConfidenceThreshold(const float &confThreshold) { m_netConfig.m_confThreshold = confThreshold; }

void vpDetectorDNNOpenCV::setNMSThreshold(const float &nmsThreshold) { m_netConfig.m_nmsThreshold = nmsThreshold; }

void vpDetectorDNNOpenCV::setDetectionFilterSizeRatio(const double &sizeRatio)
{
  m_netConfig.m_filterSizeRatio = sizeRatio;
  if (m_netConfig.m_filterSizeRatio > std::numeric_limits<double>::epsilon()) {
    m_applySizeFilterAfterNMS = true;
  }
  else {
    m_applySizeFilterAfterNMS = false;
  }
}

void vpDetectorDNNOpenCV::setInputSize(const int &width, const int &height)
{
  m_netConfig.m_inputSize.width = width;
  m_netConfig.m_inputSize.height = height;
}

void vpDetectorDNNOpenCV::setMean(const double &meanR, const double &meanG, const double &meanB) { m_netConfig.m_mean = cv::Scalar(meanR, meanG, meanB); }

void vpDetectorDNNOpenCV::setPreferableBackend(const int &backendId) { m_net.setPreferableBackend(backendId); }

void vpDetectorDNNOpenCV::setPreferableTarget(const int &targetId) { m_net.setPreferableTarget(targetId); }

void vpDetectorDNNOpenCV::setScaleFactor(const double &scaleFactor)
{
  m_netConfig.m_scaleFactor = scaleFactor;
  if ((m_netConfig.m_parsingMethodType == YOLO_V7 || m_netConfig.m_parsingMethodType == YOLO_V8) && m_netConfig.m_scaleFactor != 1 / 255.) {
    std::cout << "[vpDetectorDNNOpenCV::setScaleFactor] WARNING: scale factor should be 1/255. to normalize pixel values." << std::endl;
  }
}

void vpDetectorDNNOpenCV::setSwapRB(const bool &swapRB) { m_netConfig.m_swapRB = swapRB; }

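/*
  Editor's note: cv::dnn::blobFromImage computes (pixel - mean) * scaleFactor
  for each channel, so a scale factor of 1/255 maps the usual [0, 255] pixel
  range to the [0, 1] range that YOLOv7/v8 style networks expect. This is why
  setScaleFactor() above warns about, and setParsingMethod() below enforces,
  that value.
*/
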
void vpDetectorDNNOpenCV::setParsingMethod(const DNNResultsParsingType &typeParsingMethod, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
{
  m_netConfig.m_parsingMethodType = typeParsingMethod;
  m_parsingMethod = parsingMethod;
  if ((m_netConfig.m_parsingMethodType == YOLO_V7 || m_netConfig.m_parsingMethodType == YOLO_V8) && m_netConfig.m_scaleFactor != 1 / 255.) {
    m_netConfig.m_scaleFactor = 1 / 255.;
    std::cout << "[vpDetectorDNNOpenCV::setParsingMethod] NB: scale factor changed to 1/255. to normalize pixel values." << std::endl;
  }

#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
  if (m_netConfig.m_parsingMethodType == SSD_MOBILENET) {
    std::cout << "[vpDetectorDNNOpenCV::setParsingMethod] WARNING: The chosen type of network is " << dnnResultsParsingTypeToString(m_netConfig.m_parsingMethodType) << " and VISP_BUILD_DEPRECATED_FUNCTIONS is set to true." << std::endl;
    std::cout << "\tThe parsing method that worked with the networks quoted in the ViSP documentation was postProcess_ResNet_10 instead of postProcess_SSD_MobileNet." << std::endl;
    std::cout << "\tIf the SSD-MobileNet network does not seem to work, please try to recompile ViSP with VISP_BUILD_DEPRECATED_FUNCTIONS set to false." << std::endl << std::flush;
  }
#endif
}

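/*
  Usage sketch (editor's example, not part of the library): plugging in a
  user-specified post-processing function for a network whose raw output
  layout is not covered by the built-in parsers. The callback body is purely
  illustrative.

  \code
  void myParsingMethod(vpDetectorDNNOpenCV::DetectionCandidates &proposals,
                       std::vector<cv::Mat> &dnnRes,
                       const vpDetectorDNNOpenCV::NetConfig &netConfig)
  {
    // Fill proposals.m_boxes, proposals.m_confidences and proposals.m_classIds
    // from the raw blobs in dnnRes, keeping only scores above
    // netConfig.m_confThreshold; NMS is applied afterwards by postProcess().
  }

  vpDetectorDNNOpenCV detector;
  detector.setParsingMethod(vpDetectorDNNOpenCV::USER_SPECIFIED, myParsingMethod);
  \endcode
*/
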
END_VISP_NAMESPACE
#elif !defined(VISP_BUILD_SHARED_LIBS)
// Workaround to avoid warning: libvisp_core.a(vpDetectorDNNOpenCV.cpp.o) has no symbols
void dummy_vpDetectorDNN() { };
#endif