// Visual Servoing Platform  version 3.6.1 under development (2024-11-15)
// vpDetectorDNNOpenCV.cpp
1 /*
2  * ViSP, open source Visual Servoing Platform software.
3  * Copyright (C) 2005 - 2024 by Inria. All rights reserved.
4  *
5  * This software is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  * See the file LICENSE.txt at the root directory of this source
10  * distribution for additional information about the GNU GPL.
11  *
12  * For using ViSP with software that can not be combined with the GNU
13  * GPL, please contact Inria about acquiring a ViSP Professional
14  * Edition License.
15  *
16  * See https://visp.inria.fr for more information.
17  *
18  * This software was developed at:
19  * Inria Rennes - Bretagne Atlantique
20  * Campus Universitaire de Beaulieu
21  * 35042 Rennes Cedex
22  * France
23  *
24  * If you have questions regarding the use of this file, please contact
25  * Inria at visp@inria.fr
26  *
27  * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
28  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
29  *
30  * Description:
31  * DNN object detection using OpenCV DNN module.
32  */
33 
34 #include <visp3/core/vpConfig.h>
35 
36 // Check if std:c++17 or higher
37 #if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
38  ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
39 
40 #include <visp3/core/vpImageConvert.h>
41 #include <visp3/detection/vpDetectorDNNOpenCV.h>
42 #include <visp3/core/vpIoTools.h>
43 
44 #include<algorithm>
45 
46 BEGIN_VISP_NAMESPACE
52  std::string vpDetectorDNNOpenCV::getAvailableDnnResultsParsingTypes()
53 {
54  std::string list = "[";
55  for (unsigned int i = 0; i < vpDetectorDNNOpenCV::COUNT - 1; i++) {
56  list += "\"" + dnnResultsParsingTypeToString((vpDetectorDNNOpenCV::DNNResultsParsingType)i) + "\", ";
57  }
58  list += "\"" + dnnResultsParsingTypeToString((vpDetectorDNNOpenCV::DNNResultsParsingType)(vpDetectorDNNOpenCV::COUNT - 1)) + "\"]";
59  return list;
60 }
61 
71 std::string vpDetectorDNNOpenCV::dnnResultsParsingTypeToString(const DNNResultsParsingType &type)
72 {
73  std::string name;
74  switch (type) {
75  case YOLO_V3:
76  name = "yolov3";
77  break;
78  case YOLO_V4:
79  name = "yolov4";
80  break;
81  case YOLO_V5:
82  name = "yolov5";
83  break;
84  case YOLO_V7:
85  name = "yolov7";
86  break;
87  case YOLO_V8:
88  name = "yolov8";
89  break;
90  case YOLO_V11:
91  name = "yolov11";
92  break;
93  case FASTER_RCNN:
94  name = "faster-rcnn";
95  break;
96  case SSD_MOBILENET:
97  name = "ssd-mobilenet";
98  break;
99  case RESNET_10:
100  name = "resnet-10";
101  break;
102  case USER_SPECIFIED:
103  name = "user-specified";
104  break;
105  case COUNT:
106  name = "unknown";
107  break;
108  }
109  return name;
110 }
111 
120 vpDetectorDNNOpenCV::DNNResultsParsingType vpDetectorDNNOpenCV::dnnResultsParsingTypeFromString(const std::string &name)
121 {
122  vpDetectorDNNOpenCV::DNNResultsParsingType res(COUNT);
123  bool hasFoundMatch = false;
124  std::string name_lowercase = vpIoTools::toLowerCase(name);
125  for (int id = 0; id < COUNT && !hasFoundMatch; id++) {
126  vpDetectorDNNOpenCV::DNNResultsParsingType temp = (vpDetectorDNNOpenCV::DNNResultsParsingType)id;
127  if (dnnResultsParsingTypeToString(temp) == name_lowercase) {
128  res = temp;
129  hasFoundMatch = true;
130  }
131  }
132  return res;
133 }
134 
/*!
 * Parse a file containing the names of the classes the DNN can detect.
 * Thin wrapper that delegates to NetConfig::parseClassNamesFile().
 *
 * \param filename : Path to the class-names file.
 * \return The list of class names.
 */
std::vector<std::string> vpDetectorDNNOpenCV::parseClassNamesFile(const std::string &filename)
{
  return NetConfig::parseClassNamesFile(filename);
}
149 
/*!
 * Default constructor. No network is loaded; the parsing method is set to the
 * "unimplemented" placeholder until setParsingMethod() / setNetConfig() is called.
 */
vpDetectorDNNOpenCV::vpDetectorDNNOpenCV()
  : m_applySizeFilterAfterNMS(false), m_blob(), m_I_color(), m_img(),
  m_net(), m_netConfig(), m_outNames(), m_dnnRes(),
  m_parsingMethod(vpDetectorDNNOpenCV::postProcess_unimplemented)
{
  // Synchronize m_applySizeFilterAfterNMS with the default filter ratio
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
}
157 
/*!
 * Constructor from a network configuration.
 *
 * \param config : The network configuration (model paths, thresholds, input size...).
 * \param typeParsingMethod : The type of post-processing method to use.
 * \param parsingMethod : Optional user-specified post-processing callback,
 * used when \b typeParsingMethod is USER_SPECIFIED.
 */
vpDetectorDNNOpenCV::vpDetectorDNNOpenCV(const NetConfig &config, const DNNResultsParsingType &typeParsingMethod, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
  : m_applySizeFilterAfterNMS(false), m_blob(), m_I_color(), m_img(),
  m_net(), m_netConfig(config), m_outNames(), m_dnnRes()
{
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
  setParsingMethod(typeParsingMethod, parsingMethod);
  // Load the network right away when a model path is already known
  if (!m_netConfig.m_modelFilename.empty()) {
    readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
  }
}
175 
176 #ifdef VISP_HAVE_NLOHMANN_JSON
177 
178 using json = nlohmann::json;
179 
/*!
 * Constructor from a JSON configuration file.
 *
 * \param jsonPath : Path to the JSON file describing the network configuration.
 * \param parsingMethod : Optional user-specified post-processing callback, used
 * when the JSON configuration requests the USER_SPECIFIED parsing type.
 * \throws vpException::ioError if the file cannot be opened or parsed (see initFromJSON()).
 */
vpDetectorDNNOpenCV::vpDetectorDNNOpenCV(const std::string &jsonPath, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
  : m_applySizeFilterAfterNMS(false), m_blob(), m_I_color(), m_img(),
  m_net(), m_netConfig(), m_outNames(), m_dnnRes()
{
  initFromJSON(jsonPath); // fills m_netConfig and loads the network
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
  setParsingMethod(m_netConfig.m_parsingMethodType, parsingMethod);
}
194 
/*!
 * Initialize the detector from a JSON configuration file: reads \b m_netConfig
 * from the file content and loads the corresponding network.
 *
 * \param jsonPath : Path to the JSON file.
 * \throws vpException::ioError if the file cannot be opened or contains invalid JSON.
 */
void vpDetectorDNNOpenCV::initFromJSON(const std::string &jsonPath)
{
  std::ifstream file(jsonPath);
  if (!file.good()) {
    std::stringstream ss;
    ss << "Problem opening file " << jsonPath << ". Make sure it exists and is readable" << std::endl;
    throw vpException(vpException::ioError, ss.str());
  }
  json j;
  try {
    j = json::parse(file);
  }
  catch (json::parse_error &e) {
    std::stringstream msg;
    msg << "Could not parse JSON file : \n";

    msg << e.what() << std::endl;
    msg << "Byte position of error: " << e.byte;
    throw vpException(vpException::ioError, msg.str());
  }
  *this = j; // Call from_json(const json& j, vpDetectorDNN& *this) to read json
  file.close();
  // Load the network described by the freshly-read configuration
  readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
}
224 
230 void vpDetectorDNNOpenCV::saveConfigurationInJSON(const std::string &jsonPath) const
231 {
232  std::ofstream file(jsonPath);
233  const json j = *this;
234  file << j.dump(4);
235  file.close();
236 }
237 #endif
238 
//! Destructor. All members clean up through their own destructors (RAII).
vpDetectorDNNOpenCV::~vpDetectorDNNOpenCV() { }
243 
/*!
 * Detect objects in a grayscale ViSP image; results are returned as a flat vector.
 * The image is first converted to color, then forwarded to the color overload.
 *
 * \param I : The input grayscale image.
 * \param output : Cleared then filled with the detected features.
 * \return true if at least one object was detected.
 */
bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::vector<DetectedFeatures2D> &output)
{
  vpImageConvert::convert(I, m_I_color);

  return detect(m_I_color, output);
}
259 
/*!
 * Detect objects in a grayscale ViSP image; results are grouped by class name.
 * The image is first converted to color, then forwarded to the color overload.
 *
 * \param I : The input grayscale image.
 * \param output : Cleared then filled with the detections, keyed by class name.
 * \return true if at least one object was detected.
 */
bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
{
  vpImageConvert::convert(I, m_I_color);

  return detect(m_I_color, output);
}
275 
/*!
 * Detect objects in a grayscale ViSP image; results are returned as
 * (class name, detections) pairs. The image is first converted to color.
 *
 * \param I : The input grayscale image.
 * \param output : Appended with one pair per detected class.
 * \return true if at least one object was detected.
 */
bool vpDetectorDNNOpenCV::detect(const vpImage<unsigned char> &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
{
  vpImageConvert::convert(I, m_I_color);

  return detect(m_I_color, output);
}
291 
/*!
 * Detect objects in an RGBa ViSP image; results are returned as a flat vector.
 * The image is converted into a cv::Mat before inference.
 *
 * \param I : The input color image.
 * \param output : Cleared then filled with the detected features.
 * \return true if at least one object was detected.
 */
bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::vector<DetectedFeatures2D> &output)
{
  vpImageConvert::convert(I, m_img);

  return detect(m_img, output);
}
307 
/*!
 * Detect objects in an RGBa ViSP image; results are grouped by class name.
 * The image is converted into a cv::Mat before inference.
 *
 * \param I : The input color image.
 * \param output : Cleared then filled with the detections, keyed by class name.
 * \return true if at least one object was detected.
 */
bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
{
  vpImageConvert::convert(I, m_img);

  return detect(m_img, output);
}
323 
/*!
 * Detect objects in an RGBa ViSP image; results are returned as
 * (class name, detections) pairs. The image is converted into a cv::Mat.
 *
 * \param I : The input color image.
 * \param output : Appended with one pair per detected class.
 * \return true if at least one object was detected.
 */
bool vpDetectorDNNOpenCV::detect(const vpImage<vpRGBa> &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
{
  vpImageConvert::convert(I, m_img);

  return detect(m_img, output);
}
337 
/*!
 * Run the full detection pipeline on an OpenCV image: blob creation, network
 * forward pass, post-processing/NMS, and optional size filtering.
 *
 * \param I : The input image (shared, not copied, into \b m_img).
 * \param output : Cleared then filled with the detected features.
 * \return true if at least one object was detected.
 */
bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::vector<DetectedFeatures2D> &output)
{
  m_img = I;
  output.clear();

  // Fall back on the image dimensions when no network input size was configured
  cv::Size inputSize(m_netConfig.m_inputSize.width > 0 ? m_netConfig.m_inputSize.width : m_img.cols,
  m_netConfig.m_inputSize.height > 0 ? m_netConfig.m_inputSize.height : m_img.rows);
  cv::dnn::blobFromImage(m_img, m_blob, m_netConfig.m_scaleFactor, inputSize, m_netConfig.m_mean, m_netConfig.m_swapRB, false);

  m_net.setInput(m_blob);
  try {
    m_net.forward(m_dnnRes, m_outNames);
  }
  catch (const cv::Exception &e) {
    // A failing forward pass is retried once on the CPU backend (e.g. when the
    // configured CUDA backend is unavailable); a second failure propagates.
    std::cerr << "Caught an exception trying to run inference:" << std::endl << "\t"
      << e.what()
      << "\nCuda and/or GPU driver might not be correctly installed. Setting preferable backend to CPU and trying again." << std::endl;
    m_net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
    m_net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    m_net.forward(m_dnnRes, m_outNames);
  }

  // Parse the raw network outputs and apply NMS (fills m_indices)
  DetectionCandidates proposals;
  postProcess(proposals);
  size_t nbClassNames = m_netConfig.m_classNames.size();
  for (size_t i = 0; i < m_indices.size(); ++i) {
    int idx = m_indices[i];
    cv::Rect box = proposals.m_boxes[idx];
    std::optional<std::string> classname_opt;
    if (nbClassNames > 0) {
      // Only resolve a textual class name when a class list was provided
      classname_opt = m_netConfig.m_classNames[proposals.m_classIds[idx]];
    }
    output.emplace_back(box.x, box.x + box.width, box.y, box.y + box.height
      , proposals.m_classIds[idx], proposals.m_confidences[idx]
      , classname_opt
    );
  }

  if (m_applySizeFilterAfterNMS) {
    // removing false detections, based on the bbox sizes
    output = filterDetectionMultiClassInput(output, m_netConfig.m_filterSizeRatio);
  }

  return !output.empty();
}
390 
/*!
 * Run the full detection pipeline on an OpenCV image; detections are grouped
 * by class name (or by stringified class id when no class list is available).
 *
 * \param I : The input image (shared, not copied, into \b m_img).
 * \param output : Cleared then filled with the detections, keyed by class name.
 * \return true if at least one object was detected.
 */
bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output)
{
  m_img = I;
  output.clear();

  // Fall back on the image dimensions when no network input size was configured
  cv::Size inputSize(m_netConfig.m_inputSize.width > 0 ? m_netConfig.m_inputSize.width : m_img.cols,
  m_netConfig.m_inputSize.height > 0 ? m_netConfig.m_inputSize.height : m_img.rows);
  cv::dnn::blobFromImage(m_img, m_blob, m_netConfig.m_scaleFactor, inputSize, m_netConfig.m_mean, m_netConfig.m_swapRB, false);

  m_net.setInput(m_blob);
  try {
    m_net.forward(m_dnnRes, m_outNames);
  }
  catch (const cv::Exception &e) {
    // A failing forward pass is retried once on the CPU backend (e.g. when the
    // configured CUDA backend is unavailable); a second failure propagates.
    std::cerr << "Caught an exception trying to run inference:" << std::endl << "\t"
      << e.what()
      << "\nCuda and/or GPU driver might not be correctly installed. Setting preferable backend to CPU and trying again." << std::endl;
    m_net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
    m_net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    m_net.forward(m_dnnRes, m_outNames);
  }

  // Parse the raw network outputs and apply NMS (fills m_indices)
  DetectionCandidates proposals;
  postProcess(proposals);
  size_t nbClassNames = m_netConfig.m_classNames.size();
  for (size_t i = 0; i < m_indices.size(); ++i) {
    int idx = m_indices[i];
    cv::Rect box = proposals.m_boxes[idx];
    std::string classname;
    if (nbClassNames > 0) {
      classname = m_netConfig.m_classNames[proposals.m_classIds[idx]];
    }
    else {
      // No class list available: fall back on the numeric class id as key
      classname = std::to_string(proposals.m_classIds[idx]);
    }
    std::optional<std::string> classname_opt = std::optional<std::string>(classname);
    output[classname].emplace_back(box.x, box.x + box.width, box.y, box.y + box.height
      , proposals.m_classIds[idx], proposals.m_confidences[idx]
      , classname_opt
    );
  }

  if (m_applySizeFilterAfterNMS) {
    // removing false detections, based on the bbox sizes
    output = filterDetectionMultiClassInput(output, m_netConfig.m_filterSizeRatio);
  }

  return !output.empty();
}
446 
454 bool vpDetectorDNNOpenCV::detect(const cv::Mat &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output)
455 {
456  std::map< std::string, std::vector<DetectedFeatures2D>> map_output;
457  bool returnStatus = detect(I, map_output);
458  for (auto key_val : map_output) {
459  output.push_back(key_val);
460  }
461  return returnStatus;
462 }
463 
464 #if (VISP_HAVE_OPENCV_VERSION == 0x030403)
470 std::vector<cv::String> vpDetectorDNNOpenCV::getOutputsNames()
471 {
472  static std::vector<cv::String> names;
473  if (names.empty()) {
474  std::vector<int> outLayers = m_net.getUnconnectedOutLayers();
475  std::vector<cv::String> layersNames = m_net.getLayerNames();
476  names.resize(outLayers.size());
477  for (size_t i = 0; i < outLayers.size(); ++i)
478  names[i] = layersNames[outLayers[i] - 1];
479  }
480  return names;
481 }
482 #endif
483 
492 void vpDetectorDNNOpenCV::postProcess(DetectionCandidates &proposals)
493 {
494  switch (m_netConfig.m_parsingMethodType) {
495  case YOLO_V3:
496  case YOLO_V4:
497  postProcess_YoloV3_V4(proposals, m_dnnRes, m_netConfig);
498  break;
499  case YOLO_V5:
500  case YOLO_V7:
501  postProcess_YoloV5_V7(proposals, m_dnnRes, m_netConfig);
502  break;
503  case YOLO_V8:
504  case YOLO_V11:
505  postProcess_YoloV8_V11(proposals, m_dnnRes, m_netConfig);
506  break;
507  case FASTER_RCNN:
508  postProcess_FasterRCNN(proposals, m_dnnRes, m_netConfig);
509  break;
510  case SSD_MOBILENET:
511 #if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
512  void postProcess_SSD_MobileNet(DetectionCandidates & proposals, std::vector<cv::Mat> &dnnRes, const NetConfig & netConfig);
513 #else
514  // NB: the two SSD-MobileNet DNNs that have been tested worked only
515  // using the ResNet-10 parsing method
516  postProcess_ResNet_10(proposals, m_dnnRes, m_netConfig);
517 #endif
518  break;
519  case RESNET_10:
520  postProcess_ResNet_10(proposals, m_dnnRes, m_netConfig);
521  break;
522  case USER_SPECIFIED:
523  m_parsingMethod(proposals, m_dnnRes, m_netConfig);
524  break;
525  default:
526  throw(vpException(vpException::badValue, "Type of DNN post-processing method not handled."));
527  }
528 
529  m_indices.clear();
530  cv::dnn::NMSBoxes(proposals.m_boxes, proposals.m_confidences, m_netConfig.m_confThreshold, m_netConfig.m_nmsThreshold, m_indices);
531 }
532 
544 std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>
545 vpDetectorDNNOpenCV::filterDetectionSingleClassInput(const std::vector<DetectedFeatures2D> &detected_features, const double minRatioOfAreaOk)
546 {
547  double meanArea(0.);
548  double originalNumberOfObj = static_cast<double>(detected_features.size());
549  double meanFactor = 1. / originalNumberOfObj;
550 
551  // Computing the average area of the class
552  for (DetectedFeatures2D feature : detected_features) {
553  meanArea += feature.m_bbox.getArea();
554  }
555  meanArea *= meanFactor;
556 
557  // Keeping only the detections that respect the area criterion
558  std::vector<DetectedFeatures2D> filtered_features;
559  for (DetectedFeatures2D feature : detected_features) {
560  if (feature.m_bbox.getArea() >= minRatioOfAreaOk * meanArea && feature.m_bbox.getArea() < meanArea / minRatioOfAreaOk) {
561  filtered_features.push_back(feature);
562  }
563  }
564 
565  return filtered_features;
566 }
567 
578 std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>
579 vpDetectorDNNOpenCV::filterDetectionMultiClassInput(const std::vector<DetectedFeatures2D> &detected_features, const double minRatioOfAreaOk)
580 {
581 #ifndef DOXYGEN_SHOULD_SKIP_THIS
586  class MeanAreaComputer
587  {
588  private:
589  std::map<int, std::pair<int, double>> m_map_id_pairOccurrencesAreas;
592  std::map<int, double> m_mapMeans;
599  double computeMeanArea(const int &class_id)
600  {
601  return m_map_id_pairOccurrencesAreas[class_id].second / (double)m_map_id_pairOccurrencesAreas[class_id].first;
602  }
603 
604  public:
608  void computeMeans()
609  {
610  for (const auto &classID_pair : m_map_id_pairOccurrencesAreas) {
611  m_mapMeans[classID_pair.first] = computeMeanArea(classID_pair.first);
612  }
613  }
614 
615  double getMean(const int &class_id)
616  {
617  if (m_map_id_pairOccurrencesAreas.find(class_id) == m_map_id_pairOccurrencesAreas.end()) {
618  throw(vpException(vpException::badValue, "[MeanAreaComputer::getMean] Asking for class_id \"" + std::to_string(class_id) + "\" that is not present in m_mapMeans. Did you call computeMeans ?"));
619  }
620  return m_mapMeans[class_id];
621  }
622 
628  void operator()(const DetectedFeatures2D &feature)
629  {
630  int class_id = feature.getClassId();
631  double area = feature.getBoundingBox().getArea();
632  if (m_map_id_pairOccurrencesAreas.find(class_id) == m_map_id_pairOccurrencesAreas.end()) {
633  m_map_id_pairOccurrencesAreas[class_id] = std::pair<int, double>(1, area);
634  }
635  else {
636  std::pair<int, double> prev_state = m_map_id_pairOccurrencesAreas[class_id];
637  m_map_id_pairOccurrencesAreas[class_id] = std::pair<int, double>(prev_state.first + 1, prev_state.second + area);
638  }
639  }
640  };
641 #endif // DOXYGEN_SHOULD_SKIP_THIS
642 
643  // Computing the average area of each class
644  MeanAreaComputer meanComputer;
645  std::for_each(detected_features.begin(), detected_features.end(), meanComputer);
646  meanComputer.computeMeans();
647 
648  // Keeping only the detections that respect the area criterion
649  std::vector<DetectedFeatures2D> filtered_features;
650  for (DetectedFeatures2D feature : detected_features) {
651  double meanArea = meanComputer.getMean(feature.getClassId());
652  if (feature.m_bbox.getArea() >= minRatioOfAreaOk * meanArea
653  && feature.m_bbox.getArea() < meanArea / minRatioOfAreaOk) {
654  filtered_features.push_back(feature);
655  }
656  }
657 
658  return filtered_features;
659 }
660 
670 std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>>
671 vpDetectorDNNOpenCV::filterDetectionMultiClassInput(const std::map< std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> &detected_features, const double minRatioOfAreaOk)
672 {
673  std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> output;
674  for (auto keyval : detected_features) {
675  output[keyval.first] = filterDetectionSingleClassInput(detected_features.at(keyval.first), minRatioOfAreaOk); // removing false detections
676  }
677  return output;
678 }
679 
/*!
 * Parse the raw YoloV3 / YoloV4 outputs: each row of a batch output holds
 * [cx, cy, w, h, objectness, classScore0, ..., classScoreN] with box
 * coordinates expressed relatively to the image size (they are multiplied by
 * the image dimensions below).
 *
 * \param proposals : Filled with the candidate detections (boxes, confidences, class ids).
 * \param dnnRes : Raw network outputs, one cv::Mat per batch (may be reshaped in place).
 * \param netConfig : Provides the confidence threshold.
 */
void vpDetectorDNNOpenCV::postProcess_YoloV3_V4(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  size_t nbBatches = dnnRes.size();

  for (size_t i = 0; i < nbBatches; i++) {
    // Slightly modify from here: https://github.com/opencv/opencv/blob/8c25a8eb7b10fb50cda323ee6bec68aa1a9ce43c/samples/dnn/object_detection.cpp#L192-L221
    // Counts the number of proposed detections and the number of data corresponding to 1 detection
    int num_proposal = dnnRes[i].size[0]; // Number of detections
    int nout = dnnRes[i].size[1]; // Number of data for each detection
    if (dnnRes[i].dims > 2) {
      // Flatten a [1 x N x nout] blob into an [N x nout] one
      num_proposal = dnnRes[i].size[1];
      nout = dnnRes[i].size[2];
      dnnRes[i] = dnnRes[i].reshape(0, num_proposal);
    }

    int n = 0, row_ind = 0;
    float *pdata = (float *)dnnRes[i].data;

    // Iterate on the detections to keep only the meaningful ones
    for (n = 0; n < num_proposal; n++) {
      float box_score = pdata[4]; // objectness score of the box
      if (box_score > netConfig.m_confThreshold) {
        // Per-class scores occupy the columns after the 4 box values + objectness
        cv::Mat scores = dnnRes[i].row(row_ind).colRange(5, nout);
        cv::Point classIdPoint;
        double max_class_score;
        // Get the value and location of the maximum score
        cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);

        max_class_score *= box_score; // combine class score with objectness

        // The detection is kept only if the confidence is greater than the threshold
        if (max_class_score > netConfig.m_confThreshold) {
          const int class_idx = classIdPoint.x;
          // Rescale the relative coordinates to pixel coordinates
          float cx = pdata[0] * m_img.cols;
          float cy = pdata[1] * m_img.rows;
          float w = pdata[2] * m_img.cols;
          float h = pdata[3] * m_img.rows;

          // Convert the box center into its top-left corner
          int left = int(cx - 0.5 * w);
          int top = int(cy - 0.5 * h);

          proposals.m_confidences.push_back((float)max_class_score);
          proposals.m_boxes.push_back(cv::Rect(left, top, (int)(w), (int)(h)));
          proposals.m_classIds.push_back(class_idx);
        }
      }
      row_ind++;
      pdata += nout; // advance the raw pointer to the next detection row
    }
  }
}
744 
/*!
 * Parse the raw YoloV5 / YoloV7 outputs: same row layout as YoloV3/V4
 * ([cx, cy, w, h, objectness, classScores...]) but box coordinates are
 * expressed in network-input pixels, hence the width/height ratios below.
 *
 * \param proposals : Filled with the candidate detections (boxes, confidences, class ids).
 * \param dnnRes : Raw network outputs, one cv::Mat per batch (may be reshaped in place).
 * \param netConfig : Provides the confidence threshold and the network input size.
 */
void vpDetectorDNNOpenCV::postProcess_YoloV5_V7(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Compute the ratio between the original size of the image and the network size to translate network coordinates into
  // image coordinates
  float ratioh = (float)m_img.rows / netConfig.m_inputSize.height, ratiow = (float)m_img.cols / netConfig.m_inputSize.width;
  size_t nbBatches = dnnRes.size();

  for (size_t i = 0; i < nbBatches; i++) {
    // Counts the number of proposed detections and the number of data corresponding to 1 detection
    int num_proposal = dnnRes[i].size[0]; // Number of detections
    int nout = dnnRes[i].size[1]; // Number of data for each detection
    if (dnnRes[i].dims > 2) {
      // Flatten a [1 x N x nout] blob into an [N x nout] one
      num_proposal = dnnRes[i].size[1];
      nout = dnnRes[i].size[2];
      dnnRes[i] = dnnRes[i].reshape(0, num_proposal);
    }

    int n = 0, row_ind = 0;
    float *pdata = (float *)dnnRes[i].data;

    // Iterate on the detections to keep only the meaningful ones
    for (n = 0; n < num_proposal; n++) {
      float box_score = pdata[4]; // objectness score of the box

      if (box_score > netConfig.m_confThreshold) {
        // Per-class scores occupy the columns after the 4 box values + objectness
        cv::Mat scores = dnnRes[i].row(row_ind).colRange(5, nout);
        cv::Point classIdPoint;
        double max_class_score;
        // Get the value and location of the maximum score
        cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);
        max_class_score *= box_score; // combine class score with objectness

        // The detection is kept only if the confidence is greater than the threshold
        if (max_class_score > netConfig.m_confThreshold) {
          const int class_idx = classIdPoint.x;
          // Translate network-input coordinates into image coordinates
          float cx = pdata[0] * ratiow;
          float cy = pdata[1] * ratioh;
          float w = pdata[2] * ratiow;
          float h = pdata[3] * ratioh;

          // Convert the box center into its top-left corner
          int left = int(cx - 0.5 * w);
          int top = int(cy - 0.5 * h);

          proposals.m_confidences.push_back((float)max_class_score);
          proposals.m_boxes.push_back(cv::Rect(left, top, (int)(w), (int)(h)));
          proposals.m_classIds.push_back(class_idx);
        }
      }
      row_ind++;
      pdata += nout; // advance the raw pointer to the next detection row
    }
  }
}
809 
/*!
 * Parse the raw YoloV8 / YoloV11 outputs. These networks emit a transposed
 * layout ([nout x N]) with NO objectness column: each row after transposition
 * is [cx, cy, w, h, classScore0, ..., classScoreN], in network-input pixels.
 *
 * \param proposals : Filled with the candidate detections (boxes, confidences, class ids).
 * \param dnnRes : Raw network outputs, one cv::Mat per batch (reshaped/transposed in place).
 * \param netConfig : Provides the confidence threshold and the network input size.
 */
void vpDetectorDNNOpenCV::postProcess_YoloV8_V11(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Code adapted from here: https://github.com/JustasBart/yolov8_CPP_Inference_OpenCV_ONNX/blob/minimalistic/inference.cpp
  // Compute the ratio between the original size of the image and the network size to translate network coordinates into
  // image coordinates
  float ratioh = (float)m_img.rows / netConfig.m_inputSize.height, ratiow = (float)m_img.cols / netConfig.m_inputSize.width;
  size_t nbBatches = dnnRes.size();

  for (size_t i = 0; i < nbBatches; i++) {
    // Counts the number of proposed detections and the number of data corresponding to 1 detection
    int num_proposal = dnnRes[i].size[1]; // Number of detections
    int nout = dnnRes[i].size[0]; // Number of data for each detection
    if (dnnRes[i].dims > 2) {
      // Flatten a [1 x nout x N] blob into an [nout x N] one
      num_proposal = dnnRes[i].size[2];
      nout = dnnRes[i].size[1];
      dnnRes[i] = dnnRes[i].reshape(0, nout);
    }
    cv::transpose(dnnRes[i], dnnRes[i]); // Organise data as YoloV5 i.e. [batchsize][1:num_proposals][1:4+nb_classes]

    int n = 0, row_ind = 0;
    float *pdata = (float *)dnnRes[i].data;

    // Iterate on the detections to keep only the meaningful ones
    for (n = 0; n < num_proposal; n++) {
      // Per-class scores start right after the 4 box values (no objectness here)
      cv::Mat scores = dnnRes[i].row(row_ind).colRange(4, nout);
      cv::Point classIdPoint;
      double max_class_score;
      // Get the value and location of the maximum score
      cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);

      // The detection is kept only if the confidence is greater than the threshold
      if (max_class_score > netConfig.m_confThreshold) {
        const int class_idx = classIdPoint.x;
        // Translate network-input coordinates into image coordinates
        float cx = pdata[0] * ratiow;
        float cy = pdata[1] * ratioh;
        float w = pdata[2] * ratiow;
        float h = pdata[3] * ratioh;

        // Convert the box center into its top-left corner
        int left = int(cx - 0.5 * w);
        int top = int(cy - 0.5 * h);

        proposals.m_confidences.push_back((float)max_class_score);
        proposals.m_boxes.push_back(cv::Rect(left, top, (int)(w), (int)(h)));
        proposals.m_classIds.push_back(class_idx);
      }

      row_ind++;
      pdata += nout; // advance the raw pointer to the next detection row
    }
  }
}
872 
/*!
 * Parse the raw Faster-RCNN outputs.
 *
 * \param proposals : Filled with the candidate detections (boxes, confidences, class ids).
 * \param dnnRes : Raw network outputs, one cv::Mat per batch.
 * \param netConfig : Provides the confidence threshold.
 */
void vpDetectorDNNOpenCV::postProcess_FasterRCNN(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Direct copy from object_detection.cpp OpenCV sample
  // Faster-RCNN

  // Network produces output blob with a shape 1x1xNx7 where N is a number of
  // detections and an every detection is a vector of values
  // [batchId, classId, confidence, left, top, right, bottom]
  size_t nbBatches = dnnRes.size();
  for (size_t j = 0; j < nbBatches; j++) {
    float *data = (float *)dnnRes[j].data;
    for (size_t i = 0; i < dnnRes[j].total(); i += 7) { // 7 values per detection
      float confidence = data[i + 2];
      if (confidence > netConfig.m_confThreshold) {
        // Coordinates are relative to the image size: rescale to pixels
        int left = (int)(data[i + 3] * m_img.cols);
        int top = (int)(data[i + 4] * m_img.rows);
        int right = (int)(data[i + 5] * m_img.cols);
        int bottom = (int)(data[i + 6] * m_img.rows);
        int classId = (int)(data[i + 1]);

        proposals.m_confidences.push_back((float)confidence);
        proposals.m_boxes.push_back(cv::Rect(left, top, right - left + 1, bottom - top + 1));
        proposals.m_classIds.push_back(classId);
      }
    }
  }

}
912 
913 #if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
/*!
 * Parse the raw SSD-MobileNet outputs (deprecated parsing method, kept when
 * VISP_BUILD_DEPRECATED_FUNCTIONS is defined).
 *
 * \param proposals : Filled with the candidate detections (boxes, confidences, class ids).
 * \param dnnRes : Raw network outputs ("scores" and "boxes" blobs).
 * \param netConfig : Provides the confidence threshold.
 */
void vpDetectorDNNOpenCV::postProcess_SSD_MobileNet(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Network produces 2 outputs blob:
  // - `scores` with dimensions 1xNxC
  // - 'boxes' with dimensions 1xNx4
  // where `N` is a number of detections and `C` is the number of classes (with `BACKGROUND` as classId = 0).

  // The order of the two output blobs follows m_outNames, which is not fixed
  int scores_index = m_outNames[0] == "scores" ? 0 : 1; // scores output index.
  int boxes_index = m_outNames[0] == "boxes" ? 0 : 1; // boxes output index.

  int N = dnnRes[scores_index].size[1], C = dnnRes[scores_index].size[2];

  float *confidence = (float *)dnnRes[scores_index].data;
  float *bbox = (float *)dnnRes[boxes_index].data;

  // Loop over all guesses on the output of the network.
  for (int i = 0; i < N; i++) {
    uint32_t maxClass = 0;
    float maxScore = -1000.0f;

    for (int j = 1; j < C; j++) // ignore background (classId = 0).
    {
      const float score = confidence[i * C + j];

      if (score < netConfig.m_confThreshold)
        continue;

      if (score > maxScore) {
        maxScore = score;
        maxClass = j;
      }
    }

    // Keep the best class of this box when it passes the threshold
    if (maxScore > netConfig.m_confThreshold) {
      // Box coordinates are relative to the image size: rescale to pixels
      int left = (int)(bbox[4 * i] * m_img.cols);
      int top = (int)(bbox[4 * i + 1] * m_img.rows);
      int right = (int)(bbox[4 * i + 2] * m_img.cols);
      int bottom = (int)(bbox[4 * i + 3] * m_img.rows);
      int width = right - left + 1;
      int height = bottom - top + 1;

      int classId = maxClass;
      proposals.m_confidences.push_back(maxScore);
      proposals.m_boxes.push_back(cv::Rect(left, top, width, height));
      proposals.m_classIds.push_back(classId);
    }
  }
}
974 #endif
975 
/*!
 * Parse the raw ResNet-10 outputs (also used for the tested SSD-MobileNet
 * networks when deprecated functions are disabled).
 *
 * \param proposals : Filled with the candidate detections (boxes, confidences, class ids).
 * \param dnnRes : Raw network outputs; exactly one blob is expected.
 * \param netConfig : Provides the confidence threshold.
 */
void vpDetectorDNNOpenCV::postProcess_ResNet_10(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
{
  // Direct copy from object_detection.cpp OpenCV sample

  // Network produces output blob with a shape 1x1xNx7 where N is a number of
  // detections and an every detection is a vector of values
  // [batchId, classId, confidence, left, top, right, bottom]
  CV_Assert(dnnRes.size() == 1);
  float *data = (float *)dnnRes[0].data;
  for (size_t i = 0; i < dnnRes[0].total(); i += 7) { // 7 values per detection
    float confidence = data[i + 2];
    if (confidence > netConfig.m_confThreshold) {
      // Coordinates are relative to the image size: rescale to pixels
      int left = (int)(data[i + 3] * m_img.cols);
      int top = (int)(data[i + 4] * m_img.rows);
      int right = (int)(data[i + 5] * m_img.cols);
      int bottom = (int)(data[i + 6] * m_img.rows);
      int classId = (int)(data[i + 1]) - 1; // shift ids so the background class is dropped

      proposals.m_confidences.push_back((float)confidence);
      proposals.m_boxes.push_back(cv::Rect(left, top, right - left + 1, bottom - top + 1));
      proposals.m_classIds.push_back(classId);
    }
  }
}
1011 
1020 void vpDetectorDNNOpenCV::postProcess_unimplemented(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig)
1021 {
1022  (void)proposals;
1023  (void)dnnRes;
1024  (void)netConfig;
1025  throw(vpException(vpException::functionNotImplementedError, "vpDetectorDNNOpenCV::postProcess was called with a USER_SPECIFIED DNN but not post processing method was given."));
1026 }
1027 
/*!
 * Load a network with cv::dnn::readNet and cache the output layer names.
 *
 * \param model : Path to the model weights file.
 * \param config : Optional path to the model configuration file.
 * \param framework : Optional explicit framework name ("onnx", "caffe", ...).
 */
void vpDetectorDNNOpenCV::readNet(const std::string &model, const std::string &config, const std::string &framework)
{
  m_netConfig.m_modelFilename = model;
  m_netConfig.m_modelConfigFilename = config;
  m_netConfig.m_framework = framework;
  m_net = cv::dnn::readNet(model, config, framework);
#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
  // OpenCV 3.4.3 lacks getUnconnectedOutLayersNames(): use our fallback
  m_outNames = getOutputsNames();
#else
  m_outNames = m_net.getUnconnectedOutLayersNames();
#endif
}
1059 
/*!
 * Replace the whole network configuration, refresh the derived state (size
 * filter flag, parsing method) and reload the network if a model is set.
 *
 * \param config : The new network configuration.
 */
void vpDetectorDNNOpenCV::setNetConfig(const NetConfig &config)
{
  m_netConfig = config;
  setDetectionFilterSizeRatio(m_netConfig.m_filterSizeRatio);
  setParsingMethod(m_netConfig.m_parsingMethodType);
  if (!m_netConfig.m_modelFilename.empty()) {
    readNet(m_netConfig.m_modelFilename, m_netConfig.m_modelConfigFilename, m_netConfig.m_framework);
  }
}
1075 
//! Set the confidence threshold a detection must exceed to be kept.
void vpDetectorDNNOpenCV::setConfidenceThreshold(const float &confThreshold) { m_netConfig.m_confThreshold = confThreshold; }
1082 
//! Set the IoU threshold used by the Non-Maximum Suppression step.
void vpDetectorDNNOpenCV::setNMSThreshold(const float &nmsThreshold) { m_netConfig.m_nmsThreshold = nmsThreshold; }
1090 
1098 void vpDetectorDNNOpenCV::setDetectionFilterSizeRatio(const double &sizeRatio)
1099 {
1100  m_netConfig.m_filterSizeRatio = sizeRatio;
1101  if (m_netConfig.m_filterSizeRatio > std::numeric_limits<double>::epsilon()) {
1102  m_applySizeFilterAfterNMS = true;
1103  }
1104  else {
1105  m_applySizeFilterAfterNMS = false;
1106  }
1107 }
1108 
/*!
 * Set the size of the network input (the image is resized to it by
 * blobFromImage). A non-positive value means "use the image size".
 *
 * \param width : Network input width in pixels.
 * \param height : Network input height in pixels.
 */
void vpDetectorDNNOpenCV::setInputSize(const int &width, const int &height)
{
  m_netConfig.m_inputSize.width = width;
  m_netConfig.m_inputSize.height = height;
}
1120 
//! Set the per-channel mean subtracted from the input image before inference.
void vpDetectorDNNOpenCV::setMean(const double &meanR, const double &meanG, const double &meanB) { m_netConfig.m_mean = cv::Scalar(meanR, meanG, meanB); }
1129 
//! Select the cv::dnn computation backend (e.g. cv::dnn::DNN_BACKEND_CUDA).
void vpDetectorDNNOpenCV::setPreferableBackend(const int &backendId) { m_net.setPreferableBackend(backendId); }
1137 
//! Select the cv::dnn computation target device (e.g. cv::dnn::DNN_TARGET_CUDA).
void vpDetectorDNNOpenCV::setPreferableTarget(const int &targetId) { m_net.setPreferableTarget(targetId); }
1145 
1149 void vpDetectorDNNOpenCV::setScaleFactor(const double &scaleFactor)
1150 {
1151  m_netConfig.m_scaleFactor = scaleFactor;
1152  if ((m_netConfig.m_parsingMethodType == YOLO_V7 || m_netConfig.m_parsingMethodType == YOLO_V8 || m_netConfig.m_parsingMethodType == YOLO_V11) && m_netConfig.m_scaleFactor != 1 / 255.) {
1153  std::cout << "[vpDetectorDNNOpenCV::setParsingMethod] WARNING: scale factor should be 1/255. to normalize pixels value." << std::endl;
1154  }
1155 }
1156 
1162 void vpDetectorDNNOpenCV::setSwapRB(const bool &swapRB) { m_netConfig.m_swapRB = swapRB; }
1163 
/*!
 * Set the type of parsing method and, for USER_SPECIFIED, the callback that
 * implements it. For YoloV7/V8/V11 the scale factor is forced to 1/255.
 *
 * \param typeParsingMethod : The parsing-method type.
 * \param parsingMethod : The user callback, meaningful only with USER_SPECIFIED.
 */
void vpDetectorDNNOpenCV::setParsingMethod(const DNNResultsParsingType &typeParsingMethod, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &))
{
  m_netConfig.m_parsingMethodType = typeParsingMethod;
  m_parsingMethod = parsingMethod;
  // These Yolo flavors expect pixel values normalized by 1/255: enforce it
  if ((m_netConfig.m_parsingMethodType == YOLO_V7 || m_netConfig.m_parsingMethodType == YOLO_V8 || m_netConfig.m_parsingMethodType == YOLO_V11) && m_netConfig.m_scaleFactor != 1 / 255.) {
    m_netConfig.m_scaleFactor = 1 / 255.;
    std::cout << "[vpDetectorDNNOpenCV::setParsingMethod] NB: scale factor changed to 1/255. to normalize pixels value." << std::endl;
  }

#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
  // Warn that the deprecated SSD-MobileNet parser may not fit the tested models
  if (m_netConfig.m_parsingMethodType == SSD_MOBILENET) {
    std::cout << "[vpDetectorDNNOpenCV::setParsingMethod] WARNING: The chosen type of network is " << dnnResultsParsingTypeToString(m_netConfig.m_parsingMethodType) << " VISP_BUILD_DEPRECATED_FUNCTIONS is set to true." << std::endl;
    std::cout << "\tThe parsing method that worked with the networks quoted in the ViSP documentation was postProcess_ResNet_10 instead of postProcess_SSD_MobileNet." << std::endl;
    std::cout << "\tIf the SSD-MobileNet network does not seem to work, please try to recompile ViSP setting VISP_BUILD_DEPRECATED_FUNCTIONS as false." << std::endl << std::flush;
  }
#endif
}
1188 
1189 END_VISP_NAMESPACE
1190 #elif !defined(VISP_BUILD_SHARED_LIBS)
1191 // Work around to avoid warning: libvisp_core.a(vpDetectorDNNOpenCV.cpp.o) has no symbols
void dummy_vpDetectorDNN() { }; // Intentionally empty: provides one symbol for static builds
1193 #endif
/*
 * Doxygen cross-reference residue appended by the documentation extractor
 * (not part of the original source file):
 *   vpException — error that can be emitted by ViSP classes. Definition: vpException.h:60
 *   @ ioError — I/O error. Definition: vpException.h:67
 *   @ badValue — Used to indicate that a value is not in the allowed range. Definition: vpException.h:73
 *   @ functionNotImplementedError — Function not implemented. Definition: vpException.h:66
 *   static void convert(const vpImage< unsigned char > &src, vpImage< vpRGBa > &dest)
 *   static std::string toLowerCase(const std::string &input) — Return a lower-case
 *     version of the string input. Numbers and special characters stay the same.
 *     Definition: vpIoTools.cpp:1339
 */