Visual Servoing Platform  version 3.5.0 under development (2022-02-15)
tutorial-dnn-tensorrt-live.cpp
// This tutorial uses the NVIDIA TensorRT inference framework to perform object detection.
// The object detection model is provided as a `.onnx` file. The model will be parsed
// and a GPU Inference Engine (GIE) will be created if it doesn't already exist. This GIE is
// specific to the platform you're using.
//
// This tutorial was tested on an NVIDIA Jetson TX2.
//
// The object detection model used is `SSD_Mobilenet V1` (Single Shot MultiBox Detector)
// pre-trained on the PASCAL VOC dataset. It can detect 20 classes.
// For more information about the model, see this link:
//
// https://github.com/qfgaohao/pytorch-ssd
//

#include <iostream>
#include <fstream> // std::ifstream, used to read the label file
#include <visp3/core/vpConfig.h>

#if defined(VISP_HAVE_TENSORRT) && defined(VISP_HAVE_OPENCV)
#include <opencv2/opencv_modules.hpp>
#if defined(HAVE_OPENCV_CUDEV) && defined(HAVE_OPENCV_CUDAWARPING) && defined(HAVE_OPENCV_CUDAARITHM) && \
    defined(VISP_HAVE_OPENCV_DNN)
#include <visp3/core/vpImageConvert.h>
#include <visp3/core/vpIoTools.h>
#include <visp3/gui/vpDisplayX.h>

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/dnn.hpp>

#include <cuda_runtime_api.h>

#include <NvInfer.h>
#include <NvOnnxParser.h>

#include <sys/stat.h>

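// Pre-process one BGR frame for the network: upload it to the GPU, resize it to
// the network input size, subtract the per-channel mean and divide by 127.5 so
// the values land roughly in [-1, 1], then scatter the interleaved channels
// into planar (CHW) order directly inside the GPU input buffer `gpu_input`.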
void preprocessImage(cv::Mat &img, float *gpu_input, const nvinfer1::Dims &dims, float meanR, float meanG, float meanB)
{
  if (img.empty()) {
    std::cerr << "Image is empty." << std::endl;
    return;
  }

  cv::cuda::GpuMat gpu_frame;
  // Upload image to GPU
  gpu_frame.upload(img);

  // input_dims is in NxCxHxW format.
  auto input_width = dims.d[3];
  auto input_height = dims.d[2];
  auto channels = dims.d[1];
  auto input_size = cv::Size(input_width, input_height);

  // Resize
  cv::cuda::GpuMat resized;
  cv::cuda::resize(gpu_frame, resized, input_size, 0, 0, cv::INTER_NEAREST);

  // Normalize
  cv::cuda::GpuMat flt_image;
  resized.convertTo(flt_image, CV_32FC3);
  cv::cuda::subtract(flt_image, cv::Scalar(meanR, meanG, meanB), flt_image, cv::noArray(), -1);
  cv::cuda::divide(flt_image, cv::Scalar(127.5f, 127.5f, 127.5f), flt_image, 1, -1);

  // To tensor
  std::vector<cv::cuda::GpuMat> chw;
  for (int i = 0; i < channels; ++i)
    chw.emplace_back(cv::cuda::GpuMat(input_size, CV_32FC1, gpu_input + i * input_width * input_height));
  cv::cuda::split(flt_image, chw);
}

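// Number of elements of a tensor with the given dimensions (product over all
// dimensions); used below to size the host and device buffers.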
size_t getSizeByDim(const nvinfer1::Dims &dims)
{
  size_t size = 1;
  for (int i = 0; i < dims.nbDims; ++i)
    size *= dims.d[i];
  return size;
}

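// Copy the raw outputs from the GPU to the host and turn them into boxes:
// output 0 holds per-box class scores (1 x N x C), output 1 holds normalized
// box coordinates (1 x N x 4). The best-scoring class is kept for each box,
// then Non-Maximum Suppression prunes overlapping candidates.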
std::vector<cv::Rect> postprocessResults(std::vector<void *> buffers, const std::vector<nvinfer1::Dims> &output_dims,
                                         int batch_size, int image_width, int image_height, float confThresh,
                                         float nmsThresh, std::vector<int> &classIds)
{
  // private variables of vpDetectorDNN
  std::vector<cv::Rect> m_boxes, m_boxesNMS;
  std::vector<int> m_classIds;
  std::vector<float> m_confidences;
  std::vector<int> m_indices;

  // copy results from GPU to CPU
  std::vector<std::vector<float> > cpu_outputs;
  for (size_t i = 0; i < output_dims.size(); i++) {
    cpu_outputs.push_back(std::vector<float>(getSizeByDim(output_dims[i]) * batch_size));
    cudaMemcpy(cpu_outputs[i].data(), (float *)buffers[1 + i], cpu_outputs[i].size() * sizeof(float),
               cudaMemcpyDeviceToHost);
  }

  // post process
  // Outputs are in 1 x N x C format; N: number of output detection boxes (fixed in the model),
  // C: number of classes.
  int N = output_dims[0].d[1], C = output_dims[0].d[2];
  for (int i = 0; i < N; i++) // for all N (boxes)
  {
    uint32_t maxClass = 0;
    float maxScore = -1000.0f;

    for (int j = 1; j < C; j++) // ignore background (classId = 0).
    {
      const float score = cpu_outputs[0][i * C + j];

      if (score < confThresh)
        continue;

      if (score > maxScore) {
        maxScore = score;
        maxClass = j;
      }
    }

    if (maxScore > confThresh) {
      int left = (int)(cpu_outputs[1][4 * i] * image_width);
      int top = (int)(cpu_outputs[1][4 * i + 1] * image_height);
      int right = (int)(cpu_outputs[1][4 * i + 2] * image_width);
      int bottom = (int)(cpu_outputs[1][4 * i + 3] * image_height);
      int width = right - left + 1;
      int height = bottom - top + 1;

      m_boxes.push_back(cv::Rect(left, top, width, height));
      m_classIds.push_back(maxClass);
      m_confidences.push_back(maxScore);
    }
  }

  cv::dnn::NMSBoxes(m_boxes, m_confidences, confThresh, nmsThresh, m_indices);
  m_boxesNMS.resize(m_indices.size());
  for (size_t i = 0; i < m_indices.size(); ++i) {
    int idx = m_indices[i];
    m_boxesNMS[i] = m_boxes[idx];
  }

  classIds = m_classIds; // Return the class Ids of the detected objects.
  return m_boxesNMS;
}

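// TensorRT entry points require an nvinfer1::ILogger implementation; this
// minimal logger prints the selected severity levels to stdout.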
class Logger : public nvinfer1::ILogger
{
public:
  void log(Severity severity, const char *msg) noexcept // override
  {
    if ((severity == Severity::kERROR) || (severity == Severity::kINTERNAL_ERROR) || (severity == Severity::kVERBOSE))
      std::cout << msg << std::endl;
  }
} gLogger;

// Deleter that destroys TensorRT objects when a TRTUniquePtr goes out of scope
struct TRTDestroy {
  template <class T> void operator()(T *obj) const
  {
    if (obj)
      obj->destroy();
  }
};

template <class T> using TRTUniquePtr = std::unique_ptr<T, TRTDestroy>;

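// Get an inference engine and execution context for the given ONNX model.
// If a serialized engine cache (`<model>.engine`) exists next to the model, it
// is deserialized, which is much faster than rebuilding; otherwise the model
// is parsed, an engine is built for this platform and written to the cache.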
bool parseOnnxModel(const std::string &model_path, TRTUniquePtr<nvinfer1::ICudaEngine> &engine,
                    TRTUniquePtr<nvinfer1::IExecutionContext> &context)
{
  // this section of code is from jetson-inference's `tensorNet`, to test if the GIE already exists.
  char cache_prefix[FILENAME_MAX];
  char cache_path[FILENAME_MAX];

  sprintf(cache_prefix, "%s", model_path.c_str());
  sprintf(cache_path, "%s.engine", cache_prefix);

  std::cout << "attempting to open engine cache file " << cache_path << std::endl;

  if (vpIoTools::checkFilename(cache_path)) {
    char *engineStream = NULL;
    size_t engineSize = 0;

    // determine the file size of the engine
    struct stat filestat;
    stat(cache_path, &filestat);
    engineSize = filestat.st_size;

    // allocate memory to hold the engine
    engineStream = (char *)malloc(engineSize);

    // open the engine cache file from disk
    FILE *cacheFile = NULL;
    cacheFile = fopen(cache_path, "rb");

    // read the serialized engine into memory
    const size_t bytesRead = fread(engineStream, 1, engineSize, cacheFile);

    if (bytesRead != engineSize) // Problem while deserializing.
    {
      std::cerr << "Error reading serialized engine into memory." << std::endl;
      fclose(cacheFile);
      free(engineStream);
      return false;
    }

    // close the plan cache
    fclose(cacheFile);

    // Recreate the inference runtime
    TRTUniquePtr<nvinfer1::IRuntime> infer{nvinfer1::createInferRuntime(gLogger)};
    engine.reset(infer->deserializeCudaEngine(engineStream, engineSize, NULL));
    context.reset(engine->createExecutionContext());
    free(engineStream); // deserializeCudaEngine() copies the data, so the stream can be released

    return true;
  }

  else {
    if (!vpIoTools::checkFilename(model_path)) {
      std::cerr << "Could not parse ONNX model. File not found" << std::endl;
      return false;
    }

    TRTUniquePtr<nvinfer1::IBuilder> builder{nvinfer1::createInferBuilder(gLogger)};
    TRTUniquePtr<nvinfer1::INetworkDefinition> network{
        builder->createNetworkV2(1U << (uint32_t)nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)};
    TRTUniquePtr<nvonnxparser::IParser> parser{nvonnxparser::createParser(*network, gLogger)};

    // parse ONNX
    if (!parser->parseFromFile(model_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO))) {
      std::cerr << "ERROR: could not parse the model." << std::endl;
      return false;
    }

    TRTUniquePtr<nvinfer1::IBuilderConfig> config{builder->createBuilderConfig()};
    // allow TRT to use up to 32 MB of GPU memory for tactic selection
    config->setMaxWorkspaceSize(32 << 20);
    // use FP16 mode if possible
    if (builder->platformHasFastFp16()) {
      config->setFlag(nvinfer1::BuilderFlag::kFP16);
    }

    builder->setMaxBatchSize(1);

    engine.reset(builder->buildEngineWithConfig(*network, *config));
    context.reset(engine->createExecutionContext());

    TRTUniquePtr<nvinfer1::IHostMemory> serMem{engine->serialize()};

    if (!serMem) {
      std::cout << "Failed to serialize CUDA engine." << std::endl;
      return false;
    }

    const char *serData = (char *)serMem->data();
    const size_t serSize = serMem->size();

    // allocate memory to store the bitstream
    char *engineMemory = (char *)malloc(serSize);

    if (!engineMemory) {
      std::cout << "Failed to allocate memory to store CUDA engine." << std::endl;
      return false;
    }

    memcpy(engineMemory, serData, serSize);

    // write the cache file
    FILE *cacheFile = NULL;
    cacheFile = fopen(cache_path, "wb");

    fwrite(engineMemory, 1, serSize, cacheFile);
    fclose(cacheFile);
    free(engineMemory);

    return true;
  }
}

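// Example invocation (the binary name is assumed to follow this source file;
// all options shown are parsed below, and --help prints the full usage):
//   ./tutorial-dnn-tensorrt-live --device 0 --confThresh 0.6 --nmsThresh 0.4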
int main(int argc, char **argv)
{
  int opt_device = 0;
  unsigned int opt_scale = 1;
  std::string input = "";
  std::string modelFile = vpIoTools::getViSPImagesDataPath() + "/dnn/object_detection/ssd-mobilenet.onnx";
  std::string labelFile = vpIoTools::getViSPImagesDataPath() + "/dnn/object_detection/pascal-voc-labels.txt";
  std::string config = "";
  float meanR = 127.5f, meanG = 127.5f, meanB = 127.5f;
  float confThresh = 0.5f;
  float nmsThresh = 0.4f;

  for (int i = 1; i < argc; i++) {
    if (std::string(argv[i]) == "--device" && i + 1 < argc) {
      opt_device = atoi(argv[i + 1]);
    } else if (std::string(argv[i]) == "--input" && i + 1 < argc) {
      input = std::string(argv[i + 1]);
    } else if (std::string(argv[i]) == "--model" && i + 1 < argc) {
      modelFile = std::string(argv[i + 1]);
    } else if (std::string(argv[i]) == "--config" && i + 1 < argc) {
      config = std::string(argv[i + 1]);
    } else if (std::string(argv[i]) == "--input-scale" && i + 1 < argc) {
      opt_scale = atoi(argv[i + 1]);
    } else if (std::string(argv[i]) == "--mean" && i + 3 < argc) {
      meanR = atof(argv[i + 1]);
      meanG = atof(argv[i + 2]);
      meanB = atof(argv[i + 3]);
    } else if (std::string(argv[i]) == "--confThresh" && i + 1 < argc) {
      confThresh = (float)atof(argv[i + 1]);
    } else if (std::string(argv[i]) == "--nmsThresh" && i + 1 < argc) {
      nmsThresh = (float)atof(argv[i + 1]);
    } else if (std::string(argv[i]) == "--labels" && i + 1 < argc) {
      labelFile = std::string(argv[i + 1]);
    } else if (std::string(argv[i]) == "--help" || std::string(argv[i]) == "-h") {
      std::cout << argv[0]
                << " [--device <camera device number>] [--input <path to image or video>"
                   " (camera is used if input is empty)] [--model <path to net trained weights>]"
                   " [--config <path to net config file>]"
                   " [--input-scale <input scale factor>] [--mean <meanR meanG meanB>]"
                   " [--confThresh <confidence threshold>]"
                   " [--nmsThresh <NMS threshold>] [--labels <path to label file>]"
                << std::endl;
      return EXIT_SUCCESS;
    }
  }

  std::string model_path(modelFile);
  int batch_size = 1;

  std::vector<std::string> labels;
  if (!labelFile.empty()) {
    std::ifstream f_label(labelFile);
    std::string line;
    while (std::getline(f_label, line)) {
      labels.push_back(line);
    }
  }

  // Parse the model and initialize the engine and the context.
  TRTUniquePtr<nvinfer1::ICudaEngine> engine{nullptr};
  TRTUniquePtr<nvinfer1::IExecutionContext> context{nullptr};
  if (!parseOnnxModel(model_path, engine, context)) // Problem parsing Onnx model
  {
    std::cout << "Make sure the model file exists. To see available models, please visit: "
                 "\n\twww.github.com/lagadic/visp-images/dnn/object_detection/"
              << std::endl;
    return EXIT_FAILURE;
  }

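  // One device buffer per engine binding: buffers[0] receives the pre-processed
  // input, the remaining entries collect the network outputs.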
  std::vector<nvinfer1::Dims> input_dims;
  std::vector<nvinfer1::Dims> output_dims;
  std::vector<void *> buffers(engine->getNbBindings()); // buffers for input and output data.

  for (int i = 0; i < engine->getNbBindings(); ++i) {
    auto binding_size = getSizeByDim(engine->getBindingDimensions(i)) * batch_size * sizeof(float);
    cudaMalloc(&buffers[i], binding_size);

    if (engine->bindingIsInput(i)) {
      input_dims.emplace_back(engine->getBindingDimensions(i));
    } else {
      output_dims.emplace_back(engine->getBindingDimensions(i));
    }
  }

  if (input_dims.empty() || output_dims.empty()) {
    std::cerr << "Expect at least one input and one output for network" << std::endl;
    return -1;
  }

  cv::VideoCapture capture;

  if (input.empty()) {
    capture.open(opt_device);
  } else {
    capture.open(input);
  }

  if (!capture.isOpened()) { // check if we succeeded
    std::cout << "Failed to open the camera or video stream" << std::endl;
    return EXIT_FAILURE;
  }

  int cap_width = (int)capture.get(cv::CAP_PROP_FRAME_WIDTH);
  int cap_height = (int)capture.get(cv::CAP_PROP_FRAME_HEIGHT);
  capture.set(cv::CAP_PROP_FRAME_WIDTH, cap_width / opt_scale);
  capture.set(cv::CAP_PROP_FRAME_HEIGHT, cap_height / opt_scale);

  vpImage<vpRGBa> I;
  cv::Mat frame;
  capture >> frame;

  if (input.empty()) {
    int i = 0;
    while ((i++ < 20) && !capture.read(frame)) {
    }; // warm up the camera by skipping unread frames
  }

  vpImageConvert::convert(frame, I);
  int height = I.getHeight(), width = I.getWidth();

  std::cout << "Image size: " << width << " x " << height << std::endl;

  std::vector<cv::Rect> boxesNMS;
  std::vector<int> classIds;

  vpDisplayX d(I);

  double start, stop;
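  // Detection loop: grab a frame, pre-process it into the input buffer, run
  // inference, post-process the outputs and overlay the boxes and labels.
  // A click in the display window stops the loop.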
  while (!vpDisplay::getClick(I, false)) {
    // get frame.
    capture >> frame;

    vpImageConvert::convert(frame, I);

    start = vpTime::measureTimeMs();
    // preprocess
    preprocessImage(frame, (float *)buffers[0], input_dims[0], meanR, meanG, meanB);

    // inference.
    context->enqueue(batch_size, buffers.data(), 0, nullptr);

    // post-process
    boxesNMS = postprocessResults(buffers, output_dims, batch_size, width, height, confThresh, nmsThresh, classIds);

    stop = vpTime::measureTimeMs();
    // display.
    vpDisplay::display(I);
    vpDisplay::displayText(I, 10, 10, std::to_string(stop - start) + " ms", vpColor::red);

    for (unsigned int i = 0; i < boxesNMS.size(); i++) {
      vpDisplay::displayRectangle(I, vpRect(boxesNMS[i].x, boxesNMS[i].y, boxesNMS[i].width, boxesNMS[i].height),
                                  vpColor::red, false, 2);
      vpDisplay::displayText(I, boxesNMS[i].y - 10, boxesNMS[i].x, labels[classIds[i]], vpColor::red);
    }

    vpDisplay::flush(I);
  }

  for (void *buf : buffers)
    cudaFree(buf);

  return EXIT_SUCCESS;
}
#else
int main()
{
  std::cout << "OpenCV is not built with the CUDA modules and DNN support required by this tutorial." << std::endl;

  return EXIT_SUCCESS;
}
#endif

#else
int main()
{
  std::cout << "ViSP is not built with TensorRT." << std::endl;

  return EXIT_SUCCESS;
}
#endif