Visual Servoing Platform  version 3.6.1 under development (2024-11-15)
vpDetectorDNNOpenCV.h
1 /*
2  * ViSP, open source Visual Servoing Platform software.
3  * Copyright (C) 2005 - 2024 by Inria. All rights reserved.
4  *
5  * This software is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  * See the file LICENSE.txt at the root directory of this source
10  * distribution for additional information about the GNU GPL.
11  *
12  * For using ViSP with software that can not be combined with the GNU
13  * GPL, please contact Inria about acquiring a ViSP Professional
14  * Edition License.
15  *
16  * See https://visp.inria.fr for more information.
17  *
18  * This software was developed at:
19  * Inria Rennes - Bretagne Atlantique
20  * Campus Universitaire de Beaulieu
21  * 35042 Rennes Cedex
22  * France
23  *
24  * If you have questions regarding the use of this file, please contact
25  * Inria at visp@inria.fr
26  *
27  * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
28  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
29  *
30  * Description:
31  * DNN object detection using OpenCV DNN module.
32  */
33 
34 #ifndef VP_DETECTOR_DNN_OPENCV_H
35 #define VP_DETECTOR_DNN_OPENCV_H
36 
37 #include <visp3/core/vpConfig.h>
38 
39 // Check if std:c++17 or higher.
40 // Here we cannot use (VISP_CXX_STANDARD >= VISP_CXX_STANDARD_17) in the declaration of the class
41 #if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
42  ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
43 
44 #include <map>
45 #include <string>
46 #include <vector>
47 
48 #include <opencv2/dnn.hpp>
49 
50 #include <visp3/core/vpColor.h>
51 #include <visp3/core/vpDisplay.h>
52 #include <visp3/core/vpImage.h>
53 #include <visp3/core/vpRect.h>
54 
55 #include <optional>
56 
57 #ifdef VISP_HAVE_NLOHMANN_JSON
58 #include VISP_NLOHMANN_JSON(json.hpp)
59 #endif
60 
61 BEGIN_VISP_NAMESPACE
85 class VISP_EXPORT vpDetectorDNNOpenCV
86 {
87 public:
/*!
 * Selects how the raw outputs of the network are turned into detections.
 * The numeric values are fixed explicitly so they stay stable if entries are reordered.
 */
enum DNNResultsParsingType
{
  USER_SPECIFIED = 0, /*!< NOTE(review): presumably the parser is the user-supplied function pointer - confirm in the .cpp. */
  FASTER_RCNN = 1,    /*!< Faster R-CNN style outputs. */
  SSD_MOBILENET = 2,  /*!< SSD MobileNet style outputs. */
  RESNET_10 = 3,      /*!< ResNet-10 style outputs. */
  YOLO_V3 = 4,        /*!< YOLO v3 style outputs. */
  YOLO_V4 = 5,        /*!< YOLO v4 style outputs. */
  YOLO_V5 = 6,        /*!< YOLO v5 style outputs. */
  YOLO_V7 = 7,        /*!< YOLO v7 style outputs. */
  YOLO_V8 = 8,        /*!< YOLO v8 style outputs. */
  YOLO_V11 = 9,       /*!< YOLO v11 style outputs. */
  COUNT = 10          /*!< Number of entries above; not a parsing type by itself. */
};
107 
108  typedef struct DetectionCandidates
109  {
110  std::vector< float > m_confidences;
111  std::vector< cv::Rect > m_boxes;
112  std::vector< int > m_classIds;
113  } DetectionCandidates;
114 
120  typedef class DetectedFeatures2D
121  {
122  protected:
123  vpRect m_bbox;
124  double m_score;
125  unsigned int m_cls;
126  std::optional<std::string> m_classname;
127  public:
139  inline explicit DetectedFeatures2D(double u_min, double u_max
140  , double v_min, double v_max
141  , unsigned int cls, double score
142  , const std::optional<std::string> &classname
143  )
144  : m_bbox(vpImagePoint(v_min, u_min), vpImagePoint(v_max, u_max))
145  , m_score(score)
146  , m_cls(cls)
147  {
148  if (classname) {
149  m_classname = classname;
150  }
151  else {
152  m_classname = std::nullopt;
153  }
154  };
155 
159  inline vpRect getBoundingBox() const { return m_bbox; }
163  inline double getConfidenceScore() const { return m_score; }
167  inline unsigned int getClassId() const { return m_cls; }
171  inline std::optional<std::string> getClassName() const { return m_classname; }
172 
173  template < typename Type >
174  void display(const vpImage< Type > &img, const vpColor &color = vpColor::blue, unsigned int thickness = 1) const;
175 
176  friend vpDetectorDNNOpenCV;
177  } DetectedFeatures2D;
178 
183  typedef class NetConfig
184  {
185  private:
186  float m_confThreshold;
187  float m_nmsThreshold;
188  std::vector<std::string> m_classNames;
189  cv::Size m_inputSize;
190  double m_filterSizeRatio;
192  cv::Scalar m_mean;
193  double m_scaleFactor;
194  bool m_swapRB; /*!< If true, swap R and B for mean subtraction, e.g. when a model has been trained on BGR image format. */
195  DNNResultsParsingType m_parsingMethodType;
196  std::string m_modelFilename;
197  std::string m_modelConfigFilename; /*!< Path to the model's additional configuration file, e.g. a pbtxt file. */
198  std::string m_framework;
200 #ifdef VISP_HAVE_NLOHMANN_JSON
208  friend inline void from_json(const nlohmann::json &j, NetConfig &config)
209  {
210  config.m_confThreshold = j.value("confidenceThreshold", config.m_confThreshold);
211  if (config.m_confThreshold <= 0) {
212  throw vpException(vpException::badValue, "Confidence threshold should be > 0");
213  }
214 
215  config.m_nmsThreshold = j.value("nmsThreshold", config.m_nmsThreshold);
216  if (config.m_nmsThreshold <= 0) {
217  throw vpException(vpException::badValue, "Confidence threshold should be > 0");
218  }
219 
220  config.m_filterSizeRatio = j.value("filterSizeRatio", config.m_filterSizeRatio);
221 
222  config.m_classNames = j.value("classNames", config.m_classNames);
223 
224  std::pair<unsigned int, unsigned int> resolution = j.value("resolution", std::pair<unsigned int, unsigned int>(config.m_inputSize.width, config.m_inputSize.height));
225  config.m_inputSize.width = resolution.first;
226  config.m_inputSize.height = resolution.second;
227 
228  std::vector<double> v_mean = j.value("mean", std::vector<double>({ config.m_mean[0], config.m_mean[1], config.m_mean[2] }));
229  if (v_mean.size() != 3) {
230  throw(vpException(vpException::dimensionError, "Mean should have size = 3"));
231  }
232  config.m_mean = cv::Scalar(v_mean[0], v_mean[1], v_mean[2]);
233 
234  config.m_scaleFactor = j.value("scale", config.m_scaleFactor);
235  config.m_swapRB = j.value("swapRB", config.m_swapRB);
236  config.m_parsingMethodType = dnnResultsParsingTypeFromString(j.value("parsingType", dnnResultsParsingTypeToString(config.m_parsingMethodType)));
237  config.m_modelFilename = j.value("modelFile", config.m_modelFilename);
238  config.m_modelConfigFilename = j.value("configurationFile", config.m_modelConfigFilename);
239  config.m_framework = j.value("framework", config.m_framework);
240  }
241 
248  friend inline void to_json(nlohmann::json &j, const NetConfig &config)
249  {
250  std::pair<unsigned int, unsigned int> resolution = { config.m_inputSize.width, config.m_inputSize.height };
251  std::vector<double> v_mean = { config.m_mean[0], config.m_mean[1], config.m_mean[2] };
252  j = nlohmann::json {
253  {"confidenceThreshold", config.m_confThreshold } ,
254  {"nmsThreshold" , config.m_nmsThreshold } ,
255  {"filterSizeRatio" , config.m_filterSizeRatio} ,
256  {"classNames" , config.m_classNames } ,
257  {"resolution" , resolution } ,
258  {"mean" , v_mean } ,
259  {"scale" , config.m_scaleFactor } ,
260  {"swapRB" , config.m_swapRB } ,
261  {"parsingType" , dnnResultsParsingTypeToString(config.m_parsingMethodType) },
262  {"modelFile" , config.m_modelFilename } ,
263  {"configurationFile" , config.m_modelConfigFilename } ,
264  {"framework" , config.m_framework }
265  };
266  }
267 #endif
268 
269  public:
/*!
 * Read the class names from a text file.
 *
 * Two layouts are accepted, line by line:
 * - a line that contains no '[' is taken verbatim as one class name;
 * - a line that contains '[' is treated as an inline list: every double-quoted
 *   string found after the first '[' becomes a class name, e.g.
 *   <code>["person", "bicycle"]</code>. Empty names (<code>""</code>) are skipped
 *   and a quote that is never closed ends the parsing of that line.
 *
 * A trailing carriage return (file saved with CRLF line endings) is removed
 * before a line is interpreted.
 *
 * \param[in] filename : Path of the file to read.
 * \return The class names in file order. The vector is empty when the file
 * cannot be opened or contains nothing.
 */
inline static std::vector<std::string> parseClassNamesFile(const std::string &filename)
{
  std::vector<std::string> classNames;
  std::ifstream ifs(filename);
  std::string line;
  while (std::getline(ifs, line)) {
    if (!line.empty() && line.back() == '\r') {
      line.pop_back();
    }

    const std::string::size_type openBracket = line.find('[');
    if (openBracket == std::string::npos) {
      classNames.push_back(line);
      continue;
    }

    // Walk through the quoted names located after the opening bracket. The scan
    // position strictly increases at every iteration, so the loop always ends,
    // whatever surrounds the names (spaces, commas, the closing bracket, ...).
    std::string::size_type cursor = openBracket + 1;
    for (;;) {
      const std::string::size_type nameBegin = line.find('"', cursor);
      if (nameBegin == std::string::npos) {
        break; // no further name on this line
      }
      const std::string::size_type nameEnd = line.find('"', nameBegin + 1);
      if (nameEnd == std::string::npos) {
        break; // unterminated quote: ignore the dangling text
      }
      if (nameEnd > nameBegin + 1) {
        classNames.push_back(line.substr(nameBegin + 1, nameEnd - nameBegin - 1));
      }
      cursor = nameEnd + 1;
    }
  }
  return classNames;
}
321 
325  inline NetConfig()
326  : m_confThreshold(0.5f)
327  , m_nmsThreshold(0.4f)
328  , m_classNames()
329  , m_inputSize(300, 300)
330  , m_filterSizeRatio(0.)
331  , m_mean(127.5, 127.5, 127.5)
332  , m_scaleFactor(2.0 / 255.0)
333  , m_swapRB(true)
334  , m_parsingMethodType(vpDetectorDNNOpenCV::USER_SPECIFIED)
335  , m_modelFilename()
336  , m_modelConfigFilename()
337  , m_framework()
338  {
339 
340  }
341 
342  inline NetConfig(const NetConfig &config)
343  : m_confThreshold(config.m_confThreshold)
344  , m_nmsThreshold(config.m_nmsThreshold)
345  , m_classNames(config.m_classNames)
346  , m_inputSize(config.m_inputSize.width, config.m_inputSize.height)
347  , m_filterSizeRatio(config.m_filterSizeRatio)
348  , m_mean(cv::Scalar(config.m_mean[0], config.m_mean[1], config.m_mean[2]))
349  , m_scaleFactor(config.m_scaleFactor)
350  , m_swapRB(config.m_swapRB)
351  , m_parsingMethodType(config.m_parsingMethodType)
352  , m_modelFilename(config.m_modelFilename)
353  , m_modelConfigFilename(config.m_modelConfigFilename)
354  , m_framework(config.m_framework)
355  {
356 
357  }
358 
376  inline NetConfig(float confThresh, const float &nmsThresh, const std::vector<std::string> &classNames, const cv::Size &dnnInputSize, const double &filterSizeRatio = 0.
377  , const cv::Scalar &mean = cv::Scalar(127.5, 127.5, 127.5), const double &scaleFactor = 2. / 255., const bool &swapRB = true
378  , const DNNResultsParsingType &parsingType = vpDetectorDNNOpenCV::USER_SPECIFIED, const std::string &modelFilename = "", const std::string &configFilename = "", const std::string &framework = "")
379  : m_confThreshold(confThresh)
380  , m_nmsThreshold(nmsThresh)
381  , m_classNames(classNames)
382  , m_inputSize(dnnInputSize)
383  , m_filterSizeRatio(filterSizeRatio)
384  , m_mean(mean)
385  , m_scaleFactor(scaleFactor)
386  , m_swapRB(swapRB)
387  , m_parsingMethodType(parsingType)
388  , m_modelFilename(modelFilename)
389  , m_modelConfigFilename(configFilename)
390  , m_framework(framework)
391  { }
392 
410  inline NetConfig(const float &confThresh, const float &nmsThresh, const std::string &classNamesFile, const cv::Size &dnnInputSize, const double &filterSizeRatio = 0.
411  , const cv::Scalar &mean = cv::Scalar(127.5, 127.5, 127.5), const double &scaleFactor = 2. / 255., const bool &swapRB = true
412  , const DNNResultsParsingType &parsingType = vpDetectorDNNOpenCV::USER_SPECIFIED, const std::string &modelFilename = "", const std::string &configFilename = "", const std::string &framework = "")
413  : m_confThreshold(confThresh)
414  , m_nmsThreshold(nmsThresh)
415  , m_inputSize(dnnInputSize)
416  , m_filterSizeRatio(filterSizeRatio)
417  , m_mean(mean)
418  , m_scaleFactor(scaleFactor)
419  , m_swapRB(swapRB)
420  , m_parsingMethodType(parsingType)
421  , m_modelFilename(modelFilename)
422  , m_modelConfigFilename(configFilename)
423  , m_framework(framework)
424  {
425  m_classNames = parseClassNamesFile(classNamesFile);
426  }
427 
428  inline std::string toString() const
429  {
430  std::string text;
431  text += "Model : " + m_modelFilename + "\n";
432  text += "Type : " + vpDetectorDNNOpenCV::dnnResultsParsingTypeToString(m_parsingMethodType) + "\n";
433  text += "Config (optional): " + (m_modelConfigFilename.empty() ? "\"None\"" : m_modelConfigFilename) + "\n";
434  text += "Framework (optional): " + (m_framework.empty() ? "\"None\"" : m_framework) + "\n";
435  text += "Width x Height : " + std::to_string(m_inputSize.width) + " x " + std::to_string(m_inputSize.height) + "\n";
436  text += "Mean RGB : " + std::to_string(m_mean[0]) + " " + std::to_string(m_mean[1]) + " " + std::to_string(m_mean[2]) + "\n";
437  text += "Scale : " + std::to_string(m_scaleFactor) + "\n";
438  text += "Swap RB? : " + (m_swapRB ? std::string("true") : std::string("false")) + "\n";
439  text += "Confidence threshold : " + std::to_string(m_confThreshold) + "\n";
440  text += "NMS threshold : " + std::to_string(m_nmsThreshold) + "\n";
441  text += "Filter threshold : " +
442  (m_filterSizeRatio > std::numeric_limits<double>::epsilon() ? std::to_string(m_filterSizeRatio)
443  : "disabled") + "\n";
444  return text;
445  }
446 
447  friend inline std::ostream &operator<<(std::ostream &os, const NetConfig &config)
448  {
449  os << config.toString();
450  return os;
451  }
452 
453  NetConfig &operator=(const NetConfig &config)
454  {
455  m_confThreshold = config.m_confThreshold;
456  m_nmsThreshold = config.m_nmsThreshold;
457  m_classNames = config.m_classNames;
458  m_inputSize = cv::Size(config.m_inputSize.width, config.m_inputSize.height);
459  m_filterSizeRatio = config.m_filterSizeRatio;
460  m_mean = cv::Scalar(config.m_mean[0], config.m_mean[1], config.m_mean[2]);
461  m_scaleFactor = config.m_scaleFactor;
462  m_swapRB = config.m_swapRB;
463  m_parsingMethodType = config.m_parsingMethodType;
464  m_modelFilename = config.m_modelFilename;
465  m_modelConfigFilename = config.m_modelConfigFilename;
466  m_framework = config.m_framework;
467  return *this;
468  }
469 
470  friend vpDetectorDNNOpenCV;
471  } NetConfig;
472 
 // --- Static helpers (only declared here; the definitions are not part of this header) ---
 // NOTE(review): judging by the names, these list the DNNResultsParsingType values and convert
 // them to and from text - confirm in the .cpp. NetConfig relies on the two conversion helpers
 // for its JSON support and for toString().
473  static std::string getAvailableDnnResultsParsingTypes();
474  static std::string dnnResultsParsingTypeToString(const DNNResultsParsingType &type);
475  static DNNResultsParsingType dnnResultsParsingTypeFromString(const std::string &name);
 // Same signature as NetConfig::parseClassNamesFile(). NOTE(review): presumably the same
 // parsing rules - confirm in the .cpp.
476  static std::vector<std::string> parseClassNamesFile(const std::string &filename);
 // --- Construction / destruction ---
 // Every constructor that accepts a parsingMethod defaults it to postProcess_unimplemented.
477  vpDetectorDNNOpenCV();
478  vpDetectorDNNOpenCV(const NetConfig &config, const DNNResultsParsingType &typeParsingMethod, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &) = postProcess_unimplemented);
 // JSON based construction, loading and saving exist only when VISP_HAVE_NLOHMANN_JSON is defined.
479 #ifdef VISP_HAVE_NLOHMANN_JSON
480  vpDetectorDNNOpenCV(const std::string &jsonPath, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &) = postProcess_unimplemented);
481  void initFromJSON(const std::string &jsonPath);
482  void saveConfigurationInJSON(const std::string &jsonPath) const;
483 #endif
 // Virtual destructor: the class has virtual detect() methods and may be used as a base class.
484  virtual ~vpDetectorDNNOpenCV();
485 
 // --- Detection ---
 // Nine overloads: three input types (vpImage<unsigned char>, vpImage<vpRGBa>, cv::Mat) combined
 // with three output containers (flat vector, map keyed by a string, vector of (string, detections)
 // pairs). NOTE(review): the string key is presumably the class name and the returned bool
 // presumably reports whether something was detected - confirm in the .cpp.
486  virtual bool detect(const vpImage<unsigned char> &I, std::vector<DetectedFeatures2D> &output);
487  virtual bool detect(const vpImage<unsigned char> &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output);
488  virtual bool detect(const vpImage<unsigned char> &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output);
489  virtual bool detect(const vpImage<vpRGBa> &I, std::vector<DetectedFeatures2D> &output);
490  virtual bool detect(const vpImage<vpRGBa> &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output);
491  virtual bool detect(const vpImage<vpRGBa> &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output);
492  virtual bool detect(const cv::Mat &I, std::vector<DetectedFeatures2D> &output);
493  virtual bool detect(const cv::Mat &I, std::map< std::string, std::vector<DetectedFeatures2D>> &output);
494  virtual bool detect(const cv::Mat &I, std::vector< std::pair<std::string, std::vector<DetectedFeatures2D>>> &output);
495 
 // NOTE(review): presumably loads the network into m_net; config and framework are optional
 // and default to an empty string.
496  void readNet(const std::string &model, const std::string &config = "", const std::string &framework = "");
497 
 // --- Setters ---
 // NOTE(review): presumably each one updates the matching setting of m_netConfig or m_net;
 // the bodies are not visible in this header.
498  void setNetConfig(const NetConfig &config);
499  void setConfidenceThreshold(const float &confThreshold);
500  void setNMSThreshold(const float &nmsThreshold);
501  void setDetectionFilterSizeRatio(const double &sizeRatio);
502  void setInputSize(const int &width, const int &height);
503  void setMean(const double &meanR, const double &meanG, const double &meanB);
504  void setPreferableBackend(const int &backendId);
505  void setPreferableTarget(const int &targetId);
506  void setScaleFactor(const double &scaleFactor);
507  void setSwapRB(const bool &swapRB);
508  void setParsingMethod(const DNNResultsParsingType &typeParsingMethod, void (*parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &) = postProcess_unimplemented);
 // Read-only access to the configuration currently stored in the detector.
509  inline const NetConfig &getNetConfig() const
510  {
511  return m_netConfig;
512  }
513 
514 #ifdef VISP_HAVE_NLOHMANN_JSON
522  friend inline void from_json(const nlohmann::json &j, vpDetectorDNNOpenCV &network)
523  {
524  network.m_netConfig = j.value("networkSettings", network.m_netConfig);
525  }
526 
533  friend inline void to_json(nlohmann::json &j, const vpDetectorDNNOpenCV &network)
534  {
535  j = nlohmann::json {
536  {"networkSettings", network.m_netConfig}
537  };
538  }
539 #endif
540 
541  friend inline std::ostream &operator<<(std::ostream &os, const vpDetectorDNNOpenCV &network)
542  {
543  os << network.m_netConfig;
544  return os;
545  }
546 
547 protected:
 // Only declared when the OpenCV version is exactly 3.4.3.
548 #if (VISP_HAVE_OPENCV_VERSION == 0x030403)
549  std::vector<cv::String> getOutputsNames();
550 #endif
 // --- Detection filters ---
 // NOTE(review): presumably discard detections based on their area and on minRatioOfAreaOk;
 // the exact rule lives in the .cpp. One variant works on detections of a single class, the
 // other two on detections of several classes, given either as a flat vector or as a map
 // keyed by a string.
551  std::vector<DetectedFeatures2D>
552  filterDetectionSingleClassInput(const std::vector<DetectedFeatures2D> &detected_features, const double minRatioOfAreaOk);
553 
554  std::vector<DetectedFeatures2D>
555  filterDetectionMultiClassInput(const std::vector<DetectedFeatures2D> &detected_features, const double minRatioOfAreaOk);
556 
557  std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>>
558  filterDetectionMultiClassInput(const std::map< std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> &detected_features, const double minRatioOfAreaOk);
559 
 // --- Post-processing of the raw network outputs ---
 // NOTE(review): postProcess() presumably dispatches to one of the postProcess_* methods below
 // according to the configured parsing type - confirm in the .cpp.
560  void postProcess(DetectionCandidates &proposals);
561 
 // The network-specific parsers share one parameter list: the proposals to fill, the raw
 // network outputs and the network configuration.
562  void postProcess_YoloV3_V4(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig);
563 
564  void postProcess_YoloV5_V7(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig);
565 
566  void postProcess_YoloV8_V11(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig);
567 
568  void postProcess_FasterRCNN(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig);
569 
 // Only declared when deprecated functions are built.
570 #if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
571  void postProcess_SSD_MobileNet(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig);
572 #endif
573 
574  void postProcess_ResNet_10(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig);
575 
 // Static, so it can be used as the default value of the parsingMethod function pointer
 // taken by the constructors and by setParsingMethod().
576  static void postProcess_unimplemented(DetectionCandidates &proposals, std::vector<cv::Mat> &dnnRes, const NetConfig &netConfig);
577 
 // --- Data members ---
 // NOTE(review): the roles below are inferred from names and types - confirm in the .cpp.
 // Presumably selects whether the size filter runs after non-maximum suppression.
579  bool m_applySizeFilterAfterNMS;
 // Presumably the blob given to the network as input.
581  cv::Mat m_blob;
 // Presumably a color buffer used when the input image is grayscale.
583  vpImage<vpRGBa> m_I_color;
 // Presumably the input image in OpenCV format.
585  cv::Mat m_img;
 // Presumably the indices of the proposals kept after non-maximum suppression.
587  std::vector<int> m_indices;
 // The OpenCV DNN network.
589  cv::dnn::Net m_net;
 // Network configuration, returned by getNetConfig().
591  NetConfig m_netConfig;
 // Presumably the names of the network output layers.
593  std::vector<cv::String> m_outNames;
 // Presumably the raw outputs of the last forward pass.
595  std::vector<cv::Mat> m_dnnRes;
 // Function pointer with the same signature as the parsingMethod argument of the constructors.
597  void (*m_parsingMethod)(DetectionCandidates &, std::vector<cv::Mat> &, const NetConfig &);
598 };
599 
607 template < typename Type >
608 inline void
609 vpDetectorDNNOpenCV::DetectedFeatures2D::display(const vpImage< Type > &img, const vpColor &color, unsigned int thickness) const
610 {
611  vpDisplay::displayRectangle(img, m_bbox, color, false, thickness);
612 
613  std::stringstream ss;
614  if (m_classname) {
615  ss << *m_classname;
616  }
617  else {
618  ss << m_cls;
619  }
620  ss << "(" << std::setprecision(4) << m_score * 100. << "%)";
621  vpDisplay::displayText(img, m_bbox.getTopRight(), ss.str(), color);
622 }
623 END_VISP_NAMESPACE
624 #endif
625 #endif
Class to define RGB colors available for display functionalities.
Definition: vpColor.h:157
static const vpColor blue
Definition: vpColor.h:223
static void displayRectangle(const vpImage< unsigned char > &I, const vpImagePoint &topLeft, unsigned int width, unsigned int height, const vpColor &color, bool fill=false, unsigned int thickness=1)
static void displayText(const vpImage< unsigned char > &I, const vpImagePoint &ip, const std::string &s, const vpColor &color)
error that can be emitted by ViSP classes.
Definition: vpException.h:60
@ badValue
Used to indicate that a value is not in the allowed range.
Definition: vpException.h:73
@ dimensionError
Bad dimension.
Definition: vpException.h:71
Class that defines a 2D point in an image. This class is useful for image processing and stores only ...
Definition: vpImagePoint.h:82
Definition of the vpImage class member functions.
Definition: vpImage.h:131
Defines a rectangle in the plane.
Definition: vpRect.h:79
vpImagePoint getTopRight() const
Definition: vpRect.h:212