Visual Servoing Platform  version 3.6.1 under development (2024-09-09)
tutorial-dnn-object-detection-live.cpp
1 #include <visp3/core/vpConfig.h>
3 #include <visp3/core/vpIoTools.h>
4 #include <visp3/detection/vpDetectorDNNOpenCV.h>
5 #include <visp3/gui/vpDisplayGDI.h>
6 #include <visp3/gui/vpDisplayOpenCV.h>
7 #include <visp3/gui/vpDisplayX.h>
8 
9 #if defined(HAVE_OPENCV_VIDEOIO)
10 #include <opencv2/videoio.hpp>
11 #endif
12 
13 #ifdef VISP_HAVE_NLOHMANN_JSON
14 #include <nlohmann/json.hpp>
15 using json = nlohmann::json;
16 #endif
17 
18 #ifdef ENABLE_VISP_NAMESPACE
19 using namespace VISP_NAMESPACE_NAME;
20 #endif
21 
/*!
 * Kind of container used to retrieve the detections.
 *
 * The values are contiguous and start at 0 so that the helper functions can
 * iterate over every valid choice with a plain integer index.
 */
enum ChosenDetectionContainer
{
  DETECTION_CONTAINER_MAP = 0,    //!< Named "map" on the command line.
  DETECTION_CONTAINER_VECTOR = 1, //!< Named "vector" on the command line.
  DETECTION_CONTAINER_BOTH = 2,   //!< Named "both": the map and the vector paths are both run.
  DETECTION_CONTAINER_COUNT = 3   //!< Number of valid choices; also used as the "no match" value.
};
29 
30 std::string chosenDetectionContainerToString(const ChosenDetectionContainer &choice)
31 {
32  switch (choice) {
33  case DETECTION_CONTAINER_MAP:
34  return "map";
35  case DETECTION_CONTAINER_VECTOR:
36  return "vector";
37  case DETECTION_CONTAINER_BOTH:
38  return "both";
39  default:
40  break;
41  }
42  return "unknown";
43 }
44 
45 ChosenDetectionContainer chosenDetectionContainerFromString(const std::string &choiceStr)
46 {
47  ChosenDetectionContainer choice(DETECTION_CONTAINER_COUNT);
48  bool hasFoundMatch = false;
49  for (unsigned int i = 0; i < DETECTION_CONTAINER_COUNT && !hasFoundMatch; i++) {
50  ChosenDetectionContainer candidate = (ChosenDetectionContainer)i;
51  hasFoundMatch = (chosenDetectionContainerToString(candidate) == vpIoTools::toLowerCase(choiceStr));
52  if (hasFoundMatch) {
53  choice = candidate;
54  }
55  }
56  return choice;
57 }
58 
59 std::string getAvailableDetectionContainer()
60 {
61  std::string availableContainers("< ");
62  for (unsigned int i = 0; i < DETECTION_CONTAINER_COUNT - 1; i++) {
63  std::string name = chosenDetectionContainerToString((ChosenDetectionContainer)i);
64  availableContainers += name + " , ";
65  }
66  availableContainers +=
67  chosenDetectionContainerToString((ChosenDetectionContainer)(DETECTION_CONTAINER_COUNT - 1)) + " >";
68  return availableContainers;
69 }
70 
// Entry point of the live DNN object detection tutorial.
//
// Parses the command line options, opens a cv::VideoCapture on the requested device or video,
// configures the detector (from a JSON file or from the parsed options), then runs the detection
// on every grabbed frame and displays the result until an empty frame is grabbed or the user
// right clicks in the window.
//
// Returns EXIT_FAILURE when the capture cannot be opened or when --input-json is used in a build
// without the JSON library; returns EXIT_SUCCESS otherwise, including after --help, after a caught
// vpException, and when the build lacks the required OpenCV modules or C++17.
//
// NOTE(review): the numbers embedded in this listing skip several values (78, 83-84, 200, 295, 298,
// 302, ...), so some statements are missing here: `opt_dnn_type`, `dnn` and `button` are used below
// without a visible declaration.
71 int main(int argc, const char *argv[])
72 {
73 // Check if std:c++17 or higher
74 #if defined(HAVE_OPENCV_DNN) && defined(HAVE_OPENCV_VIDEOIO) && \
75 ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
76 try {
// Default value of every option; the command line parsing below may override them.
77 std::string opt_device("0");
79 std::string opt_dnn_model = "opencv_face_detector_uint8.pb";
80 std::string opt_dnn_config = "opencv_face_detector.pbtxt";
81 std::string opt_dnn_framework = "none";
82 std::string opt_dnn_label_file = "";
85 int opt_dnn_width = 300, opt_dnn_height = 300;
86 double opt_dnn_meanR = 104.0, opt_dnn_meanG = 177.0, opt_dnn_meanB = 123.0;
87 double opt_dnn_scale_factor = 1.0;
88 bool opt_dnn_swapRB = false;
89 bool opt_step_by_step = false;
90 float opt_dnn_confThresh = 0.5f;
91 float opt_dnn_nmsThresh = 0.4f;
92 double opt_dnn_filterThresh = 0.25;
93 ChosenDetectionContainer opt_dnn_containerType = DETECTION_CONTAINER_MAP;
94 bool opt_verbose = false;
95 std::string opt_input_json = "";
96 std::string opt_output_json = "";
97 
// Command line parsing. An option that takes a value is only matched when enough arguments
// follow it. There is no final else branch: a token that matches nothing is silently ignored.
98 for (int i = 1; i < argc; i++) {
99 if (std::string(argv[i]) == "--device" && i + 1 < argc) {
100 opt_device = std::string(argv[++i]);
101 }
102 else if (std::string(argv[i]) == "--step-by-step") {
103 opt_step_by_step = true;
104 }
105 else if (std::string(argv[i]) == "--model" && i + 1 < argc) {
106 opt_dnn_model = std::string(argv[++i]);
107 }
108 else if (std::string(argv[i]) == "--type" && i + 1 < argc) {
109 opt_dnn_type = vpDetectorDNNOpenCV::dnnResultsParsingTypeFromString(std::string(argv[++i]));
110 }
111 else if (std::string(argv[i]) == "--config" && i + 1 < argc) {
112 opt_dnn_config = std::string(argv[++i]);
// A value containing "none" means that no config file is used: an empty string is stored.
113 if (opt_dnn_config.find("none") != std::string::npos) {
114 opt_dnn_config = std::string();
115 }
116 }
117 else if (std::string(argv[i]) == "--framework" && i + 1 < argc) {
118 opt_dnn_framework = std::string(argv[++i]);
// Same convention as --config: "none" is turned into an empty string.
119 if (opt_dnn_framework.find("none") != std::string::npos) {
120 opt_dnn_framework = std::string();
121 }
122 }
123 else if (std::string(argv[i]) == "--width" && i + 1 < argc) {
124 opt_dnn_width = atoi(argv[++i]);
125 }
126 else if (std::string(argv[i]) == "--height" && i + 1 < argc) {
127 opt_dnn_height = atoi(argv[++i]);
128 }
// --mean consumes the three next arguments, in R, G, B order.
129 else if (std::string(argv[i]) == "--mean" && i + 3 < argc) {
130 opt_dnn_meanR = atof(argv[++i]);
131 opt_dnn_meanG = atof(argv[++i]);
132 opt_dnn_meanB = atof(argv[++i]);
133 }
134 else if (std::string(argv[i]) == "--scale" && i + 1 < argc) {
135 opt_dnn_scale_factor = atof(argv[++i]);
136 }
137 else if (std::string(argv[i]) == "--swapRB") {
138 opt_dnn_swapRB = true;
139 }
140 else if (std::string(argv[i]) == "--confThresh" && i + 1 < argc) {
141 opt_dnn_confThresh = (float)atof(argv[++i]);
142 }
143 else if (std::string(argv[i]) == "--nmsThresh" && i + 1 < argc) {
144 opt_dnn_nmsThresh = (float)atof(argv[++i]);
145 }
146 else if (std::string(argv[i]) == "--filterThresh" && i + 1 < argc) {
147 opt_dnn_filterThresh = atof(argv[++i]);
148 }
149 else if (std::string(argv[i]) == "--labels" && i + 1 < argc) {
150 opt_dnn_label_file = std::string(argv[++i]);
151 }
// An unrecognized container name yields DETECTION_CONTAINER_COUNT, in which case none of the
// two detection branches of the main loop is executed.
152 else if (std::string(argv[i]) == "--container" && i + 1 < argc) {
153 opt_dnn_containerType = chosenDetectionContainerFromString(std::string(argv[++i]));
154 }
155 else if (std::string(argv[i]) == "--input-json" && i + 1 < argc) {
156 opt_input_json = std::string(std::string(argv[++i]));
157 }
158 else if (std::string(argv[i]) == "--output-json" && i + 1 < argc) {
159 opt_output_json = std::string(std::string(argv[++i]));
160 }
161 else if (std::string(argv[i]) == "--verbose" || std::string(argv[i]) == "-v") {
162 opt_verbose = true;
163 }
// Help: print the synopsis and the description of every option, then exit with EXIT_SUCCESS.
// The "Default" values printed are the current option values, so the options parsed before
// --help change what is displayed.
// NOTE(review): in the messages below, the synopsis entry of --config misses its closing '>' and
// --output-json is described with <type> instead of <path_to_output_json>.
164 else if (std::string(argv[i]) == "--help" || std::string(argv[i]) == "-h") {
165 std::cout << "\nSYNOPSIS " << std::endl
166 << argv[0] << " [--device <video>]"
167 << " [--model <dnn weights file>]"
168 << " [--type <dnn type>]"
169 << " [--config <dnn config file]"
170 << " [--framework <name>]"
171 << " [--width <blob width>] [--height <blob height>]"
172 << " [--mean <meanR meanG meanB>]"
173 << " [--scale <scale factor>]"
174 << " [--swapRB]"
175 << " [--confThresh <threshold>]"
176 << " [--nmsThresh <threshold>]"
177 << " [--filterThresh <threshold>]"
178 << " [--labels <file>]"
179 << " [--container <type>]"
180 << " [--input-json <path_to_input_json>]"
181 << " [--output-json <path_to_output_json>]"
182 << " [--step-by-step]"
183 << " [--verbose, -v]"
184 << " [--help, -h]" << std::endl;
185 std::cout << "\nOPTIONS " << std::endl
186 << " --device <video>" << std::endl
187 << " Camera device number or video name used to stream images." << std::endl
188 << " To use the first camera found on the bus set 0. On Ubuntu setting 0" << std::endl
189 << " will use /dev/video0 device. To use a video simply put the name of" << std::endl
190 << " the video, like \"path/my-video.mp4\" or \"path/image-%04d.png\"" << std::endl
191 << " if your video is a sequence of images." << std::endl
192 << " Default: " << opt_device << std::endl
193 << std::endl
194 << " --model <dnn weights file>" << std::endl
195 << " Path to dnn network trained weights." << std::endl
196 << " Default: " << opt_dnn_model << std::endl
197 << std::endl
198 << " --type <dnn type>" << std::endl
199 << " Type of dnn network. Admissible values are in " << std::endl
201 << " Default: " << opt_dnn_type << std::endl
202 << std::endl
203 << " --config <dnn config file>" << std::endl
204 << " Path to dnn network config file or \"none\" not to use one. " << std::endl
205 << " Default: " << opt_dnn_config << std::endl
206 << std::endl
207 << " --framework <name>" << std::endl
208 << " Framework name or \"none\" not to specify one. " << std::endl
209 << " Default: " << opt_dnn_framework << std::endl
210 << std::endl
211 << " --width <blob width>" << std::endl
212 << " Input images will be resized to this width. " << std::endl
213 << " Default: " << opt_dnn_width << std::endl
214 << std::endl
215 << " --height <blob height>" << std::endl
216 << " Input images will be resized to this height. " << std::endl
217 << " Default: " << opt_dnn_height << std::endl
218 << std::endl
219 << " --mean <meanR meanG meanB>" << std::endl
220 << " Mean RGB subtraction values. " << std::endl
221 << " Default: " << opt_dnn_meanR << " " << opt_dnn_meanG << " " << opt_dnn_meanB << std::endl
222 << std::endl
223 << " --scale <scale factor>" << std::endl
224 << " Scale factor used to normalize the range of pixel values. " << std::endl
225 << " Default: " << opt_dnn_scale_factor << std::endl
226 << std::endl
227 << " --swapRB" << std::endl
228 << " When used this option allows to swap Red and Blue channels. " << std::endl
229 << std::endl
230 << " --confThresh <threshold>" << std::endl
231 << " Confidence threshold. " << std::endl
232 << " Default: " << opt_dnn_confThresh << std::endl
233 << std::endl
234 << " --nmsThresh <threshold>" << std::endl
235 << " Non maximum suppression threshold. " << std::endl
236 << " Default: " << opt_dnn_nmsThresh << std::endl
237 << std::endl
238 << " --filterThresh <threshold >" << std::endl
239 << " Filter threshold. Set 0. to disable." << std::endl
240 << " Default: " << opt_dnn_filterThresh << std::endl
241 << std::endl
242 << " --labels <file>" << std::endl
243 << " Path to label file either in txt or yaml format. Keep empty if unknown." << std::endl
244 << " Default: \"" << opt_dnn_label_file << "\"" << std::endl
245 << std::endl
246 << " --container <type>" << std::endl
247 << " Container type in " << getAvailableDetectionContainer() << std::endl
248 << " Default: " << chosenDetectionContainerToString(opt_dnn_containerType) << std::endl
249 << std::endl
250 << " --input-json <path_to_input_json>" << std::endl
251 << " Input JSON file used to configure the DNN. If set, the other arguments will be used to override the values set in the json file." << std::endl
252 << " Default: empty" << std::endl
253 << std::endl
254 << " --output-json <type>" << std::endl
255 << " Output JSON file where will be saved the DNN configuration. If empty, does not save the configuration." << std::endl
256 << " Default: empty" << std::endl
257 << std::endl
258 << " --step-by-step" << std::endl
259 << " Enable step by step mode, waiting for a user click to process next image." << std::endl
260 << std::endl
261 << " --verbose, -v" << std::endl
262 << " Enable verbose mode." << std::endl
263 << std::endl
264 << " --help, -h" << std::endl
265 << " Display this helper message." << std::endl
266 << std::endl;
267 return EXIT_SUCCESS;
268 }
269 }
270 
271 std::cout << "Video device : " << opt_device << std::endl;
272 std::cout << "Label file (optional): " << (opt_dnn_label_file.empty() ? "None" : opt_dnn_label_file) << std::endl;
273 
// Open the capture: a device string that is a number is converted to an int and opened as such,
// any other string is passed as is to cv::VideoCapture::open().
274 cv::VideoCapture capture;
275 bool hasCaptureOpeningSucceeded;
276 if (vpMath::isNumber(opt_device)) {
277 hasCaptureOpeningSucceeded = capture.open(std::atoi(opt_device.c_str()));
278 }
279 else {
280 hasCaptureOpeningSucceeded = capture.open(opt_device);
281 }
282 if (!hasCaptureOpeningSucceeded) {
283 std::cout << "Capture from camera: " << opt_device << " didn't work" << std::endl;
284 return EXIT_FAILURE;
285 }
286 
// The display class is selected at build time among the available GUI backends.
// NOTE(review): when none of the three macros is defined, `d` is not declared although it is used
// in the main loop.
287 vpImage<vpRGBa> I;
288 #if defined(VISP_HAVE_X11)
289 vpDisplayX d;
290 #elif defined(VISP_HAVE_GDI)
291 vpDisplayGDI d;
292 #elif defined(HAVE_OPENCV_HIGHGUI)
293 vpDisplayOpenCV d;
294 #endif
296 
// A label file that is given but does not exist is handled here.
// NOTE(review): the beginning of the statement that uses the message below is missing from this
// listing; presumably it raises an error - confirm against the original source.
297 if (!opt_dnn_label_file.empty() && !vpIoTools::checkFilename(opt_dnn_label_file)) {
299 "The file containing the classes labels \"" + opt_dnn_label_file + "\" does not exist !"));
300 }
301 
// Detector configuration: from the JSON file when --input-json is given, otherwise from the
// options parsed above. The `else` placed after the #endif pairs with the `if` of whichever
// preprocessor branch is compiled.
303 #ifdef VISP_HAVE_NLOHMANN_JSON
304 if (!opt_input_json.empty()) {
306 dnn.initFromJSON(opt_input_json);
308 }
309 #else
310 if (!opt_input_json.empty()) {
311 std::cerr << "Error: NLOHMANN JSON library is not installed, please install it following ViSP documentation to configure the vpDetectorDNNOpenCV from a JSON file." << std::endl;
312 return EXIT_FAILURE;
313 }
314 #endif
315 else {
317 vpDetectorDNNOpenCV::NetConfig netConfig(opt_dnn_confThresh, opt_dnn_nmsThresh, opt_dnn_label_file
318 , cv::Size(opt_dnn_width, opt_dnn_height), opt_dnn_filterThresh, cv::Scalar(opt_dnn_meanR, opt_dnn_meanG, opt_dnn_meanB)
319 , opt_dnn_scale_factor, opt_dnn_swapRB, opt_dnn_type
320 , opt_dnn_model, opt_dnn_config, opt_dnn_framework
321 );
322 dnn.setNetConfig(netConfig);
324 }
325 
326 std::cout << dnn.getNetConfig() << std::endl;
327 
// Optionally save the configuration in use. Without the JSON library an error is only printed:
// unlike the --input-json case, the program goes on.
328 #ifdef VISP_HAVE_NLOHMANN_JSON
329 if (!opt_output_json.empty()) {
330 dnn.saveConfigurationInJSON(opt_output_json);
331 }
332 #else
333 if (!opt_output_json.empty()) {
334 std::cerr << "Error: NLOHMANN JSON library is not installed, please install it following ViSP documentation to save the configuration in a JSON file." << std::endl;
335 }
336 #endif
337 
// Main loop: one iteration per grabbed frame; an empty frame stops the loop.
338 cv::Mat frame;
339 while (true) {
340 capture >> frame;
341 if (frame.empty())
342 break;
343 
// While I is empty (first frame), convert the frame first so that the display can be
// initialized from I; afterwards only the conversion is done.
344 if (I.getSize() == 0) {
345 vpImageConvert::convert(frame, I);
346 d.init(I);
347 vpDisplay::setTitle(I, "DNN object detection");
348 if (opt_verbose) {
349 std::cout << "Process image: " << I.getWidth() << " x " << I.getHeight() << std::endl;
350 }
351 }
352 else {
353 vpImageConvert::convert(frame, I);
354 }
355 if (opt_verbose) {
356 std::cout << "Process new image" << std::endl;
357 }
358 
360 
// Detection with the results stored in a map; the key is printed as the class name below.
// Only the detect() call is timed.
361 if (opt_dnn_containerType == DETECTION_CONTAINER_MAP || opt_dnn_containerType == DETECTION_CONTAINER_BOTH) {
362 double t = vpTime::measureTimeMs();
364 std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> > detections;
365 dnn.detect(frame, detections);
367 t = vpTime::measureTimeMs() - t;
368 
// NOTE(review): both range-for loops iterate by value, which copies every entry.
370 for (auto key_val : detections) {
371 if (opt_verbose) {
372 std::cout << " Class name : " << key_val.first << std::endl;
373 }
374 for (vpDetectorDNNOpenCV::DetectedFeatures2D detection : key_val.second) {
375 if (opt_verbose) {
376 std::cout << " Bounding box : " << detection.getBoundingBox() << std::endl;
377 std::cout << " Class Id : " << detection.getClassId() << std::endl;
// The class name is tested before being read; it is printed only when the test succeeds.
378 if (detection.getClassName())
379 std::cout << " Class name : " << detection.getClassName().value() << std::endl;
380 std::cout << " Confidence score: " << detection.getConfidenceScore() << std::endl;
381 }
382 detection.display(I);
383 }
384 }
386 
387 std::ostringstream oss_map;
388 oss_map << "Detection time (map): " << t << " ms";
389 if (opt_verbose) {
390 // Displaying timing result in console
391 std::cout << " " << oss_map.str() << std::endl;
392 }
393 // Displaying timing result on the image
394 vpDisplay::displayText(I, 60, 20, oss_map.str(), vpColor::red);
395 }
396 
// Detection with the results stored in a flat vector. With the "both" container type, detect()
// is therefore called a second time on the same frame.
397 if (opt_dnn_containerType == DETECTION_CONTAINER_VECTOR || opt_dnn_containerType == DETECTION_CONTAINER_BOTH) {
398 double t_vector = vpTime::measureTimeMs();
400 std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> detections_vec;
401 dnn.detect(frame, detections_vec);
403 t_vector = vpTime::measureTimeMs() - t_vector;
404 
406 for (auto detection : detections_vec) {
407 if (opt_verbose) {
408 std::cout << " Bounding box : " << detection.getBoundingBox() << std::endl;
409 std::cout << " Class Id : " << detection.getClassId() << std::endl;
// Unlike the map branch, a missing class name is printed as "Not known".
410 std::optional<std::string> classname_opt = detection.getClassName();
411 std::cout << " Class name : " << (classname_opt ? *classname_opt : "Not known") << std::endl;
412 std::cout << " Confidence score: " << detection.getConfidenceScore() << std::endl;
413 }
414 detection.display(I);
415 }
417 
418 std::ostringstream oss_vec;
419 oss_vec << "Detection time (vector): " << t_vector << " ms";
420 if (opt_verbose) {
421 // Displaying timing result in console
422 std::cout << " " << oss_vec.str() << std::endl;
423 }
424 // Displaying timing result on the image
425 vpDisplay::displayText(I, 80, 20, oss_vec.str(), vpColor::red);
426 }
427 
428 // // UI display
429 if (opt_step_by_step) {
430 vpDisplay::displayText(I, 20, 20, "Left click to display next image", vpColor::red);
431 }
432 vpDisplay::displayText(I, 40, 20, "Right click to quit", vpColor::red);
433 
434 vpDisplay::flush(I);
436 
// presumably the third argument makes getClick() blocking, so that the loop waits for a click
// only in step by step mode - confirm against the vpDisplay documentation.
437 if (vpDisplay::getClick(I, button, opt_step_by_step)) {
438 if (button == vpMouseButton::button1) {
439 // Left click => next image
440 continue;
441 }
442 else if (button == vpMouseButton::button3) {
443 // Right click => stop the program
444 break;
445 }
446 }
447 }
448 
449 }
// A ViSP exception is only printed on stdout: the function still returns EXIT_SUCCESS below.
450 catch (const vpException &e) {
451 std::cout << e.what() << std::endl;
452 }
453 #else
// Required OpenCV modules or C++17 not available: nothing is done, the casts silence the unused
// parameter warnings.
454 (void)argc;
455 (void)argv;
456 #endif
457 return EXIT_SUCCESS;
458 }
static const vpColor red
Definition: vpColor.h:217
Structure containing the bounding box, expressed in pixels, confidence and class information about an...
void display(const vpImage< Type > &img, const vpColor &color=vpColor::blue, unsigned int thickness=1) const
std::optional< std::string > getClassName() const
Structure containing some information required for the configuration of a vpDetectorDNNOpenCV object.
void initFromJSON(const std::string &jsonPath)
DNNResultsParsingType
Enumeration listing the types of DNN for which the vpDetectorDNNOpenCV furnishes the methods permitti...
static DNNResultsParsingType dnnResultsParsingTypeFromString(const std::string &name)
const NetConfig & getNetConfig() const
void setNetConfig(const NetConfig &config)
virtual bool detect(const vpImage< unsigned char > &I, std::vector< DetectedFeatures2D > &output)
Object detection using OpenCV DNN module.
static std::string getAvailableDnnResultsParsingTypes()
Get the list of the parsing methods / types of DNNs supported by the vpDetectorDNNOpenCV class.
void saveConfigurationInJSON(const std::string &jsonPath) const
Save the network configuration in a JSON file.
Display for windows using GDI (available on any windows 32 platform).
Definition: vpDisplayGDI.h:130
The vpDisplayOpenCV allows to display image using the OpenCV library. Thus to enable this class OpenC...
Use the X11 console to display images on unix-like OS. Thus to enable this class X11 should be instal...
Definition: vpDisplayX.h:135
void init(vpImage< unsigned char > &I, int win_x=-1, int win_y=-1, const std::string &win_title="") VP_OVERRIDE
static bool getClick(const vpImage< unsigned char > &I, bool blocking=true)
virtual void setDownScalingFactor(unsigned int scale)
Definition: vpDisplay.cpp:233
static void display(const vpImage< unsigned char > &I)
static void setTitle(const vpImage< unsigned char > &I, const std::string &windowtitle)
static void flush(const vpImage< unsigned char > &I)
@ SCALE_AUTO
Definition: vpDisplay.h:184
static void displayText(const vpImage< unsigned char > &I, const vpImagePoint &ip, const std::string &s, const vpColor &color)
error that can be emitted by ViSP classes.
Definition: vpException.h:60
@ fatalError
Fatal error.
Definition: vpException.h:72
const char * what() const
Definition: vpException.cpp:71
static void convert(const vpImage< unsigned char > &src, vpImage< vpRGBa > &dest)
unsigned int getWidth() const
Definition: vpImage.h:242
unsigned int getSize() const
Definition: vpImage.h:221
unsigned int getHeight() const
Definition: vpImage.h:181
static std::string toLowerCase(const std::string &input)
Return a lower-case version of the string input . Numbers and special characters stay the same.
Definition: vpIoTools.cpp:1339
static bool checkFilename(const std::string &filename)
Definition: vpIoTools.cpp:786
static bool isNumber(const std::string &str)
Definition: vpMath.cpp:214
VISP_EXPORT double measureTimeMs()