Visual Servoing Platform  version 3.6.1 under development (2025-01-14)
tutorial-dnn-tensorrt-live.cpp
1 // This tutorial uses NVIDIA TensorRT inference framework to perform object detection.
2 // The object detection model is provided as a `.onnx` file. The model will be parsed
3 // and a GPU Inference Engine (GIE) will be created if it doesn't exist. This GIE is
4 // specific to the platform you're using.
5 //
6 // This tutorial was tested on NVIDIA Jetson TX2.
7 //
8 // The object detection model used is `SSD_Mobilenet V1` (Single Shot MultiBox Detector)
9 // pre-trained on PASCAL VOC dataset. It can detect 20 classes.
10 // For more information about the model, see this link:
11 //
12 // https://github.com/qfgaohao/pytorch-ssd
13 //
14 
15 #include <iostream>
16 
17 #include <visp3/core/vpConfig.h>
18 
19 #if defined(VISP_HAVE_TENSORRT) && defined(VISP_HAVE_OPENCV)
20 
21 #if defined(HAVE_OPENCV_CUDEV) && defined(HAVE_OPENCV_CUDAWARPING) && defined(HAVE_OPENCV_CUDAARITHM) && \
22  defined(HAVE_OPENCV_DNN) && defined(HAVE_OPENCV_VIDEOIO)
23 #include <visp3/core/vpImageConvert.h>
24 #include <visp3/core/vpIoTools.h>
25 #include <visp3/gui/vpDisplayX.h>
26 
27 #include <opencv2/videoio.hpp>
28 
30 #include <opencv2/core/cuda.hpp>
31 #include <opencv2/cudaarithm.hpp>
32 #include <opencv2/cudawarping.hpp>
33 #include <opencv2/dnn.hpp>
35 
37 #include <cuda_runtime_api.h>
39 
41 #include <NvInfer.h>
42 #include <NvOnnxParser.h>
44 
45 #include <sys/stat.h>
46 
47 #ifdef ENABLE_VISP_NAMESPACE
48 using namespace VISP_NAMESPACE_NAME;
49 #endif
50 
52 void preprocessImage(cv::Mat &img, float *gpu_input, const nvinfer1::Dims &dims, float meanR, float meanG, float meanB)
53 {
54  if (img.empty()) {
55  std::cerr << "Image is empty." << std::endl;
56  return;
57  }
58 
59  cv::cuda::GpuMat gpu_frame;
60  // Upload image to GPU
61  gpu_frame.upload(img);
62 
63  // input_dims is in NxCxHxW format.
64  auto input_width = dims.d[3];
65  auto input_height = dims.d[2];
66  auto channels = dims.d[1];
67  auto input_size = cv::Size(input_width, input_height);
68 
69  // Resize
70  cv::cuda::GpuMat resized;
71  cv::cuda::resize(gpu_frame, resized, input_size, 0, 0, cv::INTER_NEAREST);
72 
73  // Normalize
74  cv::cuda::GpuMat flt_image;
75  resized.convertTo(flt_image, CV_32FC3);
76  cv::cuda::subtract(flt_image, cv::Scalar(meanR, meanG, meanB), flt_image, cv::noArray(), -1);
77  cv::cuda::divide(flt_image, cv::Scalar(127.5f, 127.5f, 127.5f), flt_image, 1, -1);
78 
79  // To tensor
80  std::vector<cv::cuda::GpuMat> chw;
81  for (int i = 0; i < channels; ++i)
82  chw.emplace_back(cv::cuda::GpuMat(input_size, CV_32FC1, gpu_input + i * input_width * input_height));
83  cv::cuda::split(flt_image, chw);
84 }
86 
88 size_t getSizeByDim(const nvinfer1::Dims &dims)
89 {
90  size_t size = 1;
91  for (int i = 0; i < dims.nbDims; ++i)
92  size *= dims.d[i];
93  return size;
94 }
96 
98 std::vector<cv::Rect> postprocessResults(std::vector<void *> buffers, const std::vector<nvinfer1::Dims> &output_dims,
99  int batch_size, int image_width, int image_height, float confThresh,
100  float nmsThresh, std::vector<int> &classIds)
101 {
102  // private variables of vpDetectorDNN
103  std::vector<cv::Rect> m_boxes, m_boxesNMS;
104  std::vector<int> m_classIds;
105  std::vector<float> m_confidences;
106  std::vector<int> m_indices;
107 
108  // copy results from GPU to CPU
109  std::vector<std::vector<float> > cpu_outputs;
110  for (size_t i = 0; i < output_dims.size(); i++) {
111  cpu_outputs.push_back(std::vector<float>(getSizeByDim(output_dims[i]) * batch_size));
112  cudaMemcpy(cpu_outputs[i].data(), (float *)buffers[1 + i], cpu_outputs[i].size() * sizeof(float),
113  cudaMemcpyDeviceToHost);
114  }
115 
116  // post process
117  int N = output_dims[0].d[1], C = output_dims[0].d[2]; // (1 x N x C format); N: Number of output detection boxes
118  // (fixed in the model), C: Number of classes.
119  for (int i = 0; i < N; i++) // for all N (boxes)
120  {
121  uint32_t maxClass = 0;
122  float maxScore = -1000.0f;
123 
124  for (int j = 1; j < C; j++) // ignore background (classId = 0).
125  {
126  const float score = cpu_outputs[0][i * C + j];
127 
128  if (score < confThresh)
129  continue;
130 
131  if (score > maxScore) {
132  maxScore = score;
133  maxClass = j;
134  }
135  }
136 
137  if (maxScore > confThresh) {
138  int left = (int)(cpu_outputs[1][4 * i] * image_width);
139  int top = (int)(cpu_outputs[1][4 * i + 1] * image_height);
140  int right = (int)(cpu_outputs[1][4 * i + 2] * image_width);
141  int bottom = (int)(cpu_outputs[1][4 * i + 3] * image_height);
142  int width = right - left + 1;
143  int height = bottom - top + 1;
144 
145  m_boxes.push_back(cv::Rect(left, top, width, height));
146  m_classIds.push_back(maxClass);
147  m_confidences.push_back(maxScore);
148  }
149  }
150 
151  cv::dnn::NMSBoxes(m_boxes, m_confidences, confThresh, nmsThresh, m_indices);
152  m_boxesNMS.resize(m_indices.size());
153  for (size_t i = 0; i < m_indices.size(); ++i) {
154  int idx = m_indices[i];
155  m_boxesNMS[i] = m_boxes[idx];
156  }
157 
158  classIds = m_classIds; // Returning detected objects class Ids.
159  return m_boxesNMS;
160 }
162 
163 class Logger : public nvinfer1::ILogger
164 {
165 public:
166  void log(Severity severity, const char *msg) noexcept // override
167  {
168  if ((severity == Severity::kERROR) || (severity == Severity::kINTERNAL_ERROR) || (severity == Severity::kVERBOSE))
169  std::cout << msg << std::endl;
170  }
171 } gLogger;
172 
173 // destroy TensoRT objects if something goes wrong
174 struct TRTDestroy
175 {
176  template <class T> void operator()(T *obj) const
177  {
178  if (obj)
179  obj->destroy();
180  }
181 };
182 
183 template <class T> using TRTUniquePtr = std::unique_ptr<T, TRTDestroy>;
184 
186 bool parseOnnxModel(const std::string &model_path, TRTUniquePtr<nvinfer1::ICudaEngine> &engine,
187  TRTUniquePtr<nvinfer1::IExecutionContext> &context)
189 {
190  // this section of code is from jetson-inference's `tensorNet`, to test if the GIE already exists.
191  char cache_prefix[FILENAME_MAX];
192  char cache_path[FILENAME_MAX];
193 
194  snprintf(cache_prefix, FILENAME_MAX, "%s", model_path.c_str());
195  snprintf(cache_path, FILENAME_MAX, "%s.engine", cache_prefix);
196 
197  std::cout << "attempting to open engine cache file " << cache_path << std::endl;
198 
200  if (vpIoTools::checkFilename(cache_path)) {
201  char *engineStream = nullptr;
202  size_t engineSize = 0;
203 
204  // determine the file size of the engine
205  struct stat filestat;
206  stat(cache_path, &filestat);
207  engineSize = filestat.st_size;
208 
209  // allocate memory to hold the engine
210  engineStream = (char *)malloc(engineSize);
211 
212  // open the engine cache file from disk
213  FILE *cacheFile = nullptr;
214  cacheFile = fopen(cache_path, "rb");
215 
216  // read the serialized engine into memory
217  const size_t bytesRead = fread(engineStream, 1, engineSize, cacheFile);
218 
219  if (bytesRead != engineSize) // Problem while deserializing.
220  {
221  std::cerr << "Error reading serialized engine into memory." << std::endl;
222  return false;
223  }
224 
225  // close the plan cache
226  fclose(cacheFile);
227 
228  // Recreate the inference runtime
229  TRTUniquePtr<nvinfer1::IRuntime> infer { nvinfer1::createInferRuntime(gLogger) };
230  engine.reset(infer->deserializeCudaEngine(engineStream, engineSize, nullptr));
231  context.reset(engine->createExecutionContext());
232 
233  return true;
234  }
236 
238  else {
239  if (!vpIoTools::checkFilename(model_path)) {
240  std::cerr << "Could not parse ONNX model. File not found" << std::endl;
241  return false;
242  }
243 
244  TRTUniquePtr<nvinfer1::IBuilder> builder { nvinfer1::createInferBuilder(gLogger) };
245  TRTUniquePtr<nvinfer1::INetworkDefinition> network {
246  builder->createNetworkV2(1U << (uint32_t)nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH) };
247  TRTUniquePtr<nvonnxparser::IParser> parser { nvonnxparser::createParser(*network, gLogger) };
248 
249  // parse ONNX
250  if (!parser->parseFromFile(model_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO))) {
251  std::cerr << "ERROR: could not parse the model." << std::endl;
252  return false;
253  }
254 
255  TRTUniquePtr<nvinfer1::IBuilderConfig> config { builder->createBuilderConfig() };
256  // allow TRT to use up to 1GB of GPU memory for tactic selection
257  config->setMaxWorkspaceSize(32 << 20);
258  // use FP16 mode if possible
259  if (builder->platformHasFastFp16()) {
260  config->setFlag(nvinfer1::BuilderFlag::kFP16);
261  }
262 
263  builder->setMaxBatchSize(1);
264 
265  engine.reset(builder->buildEngineWithConfig(*network, *config));
266  context.reset(engine->createExecutionContext());
267 
268  TRTUniquePtr<nvinfer1::IHostMemory> serMem { engine->serialize() };
269 
270  if (!serMem) {
271  std::cout << "Failed to serialize CUDA engine." << std::endl;
272  return false;
273  }
274 
275  const char *serData = (char *)serMem->data();
276  const size_t serSize = serMem->size();
277 
278  // allocate memory to store the bitstream
279  char *engineMemory = (char *)malloc(serSize);
280 
281  if (!engineMemory) {
282  std::cout << "Failed to allocate memory to store CUDA engine." << std::endl;
283  return false;
284  }
285 
286  memcpy(engineMemory, serData, serSize);
287 
288  // write the cache file
289  FILE *cacheFile = nullptr;
290  cacheFile = fopen(cache_path, "wb");
291 
292  fwrite(engineMemory, 1, serSize, cacheFile);
293  fclose(cacheFile);
294 
295  return true;
296  }
298 }
299 
300 int main(int argc, char **argv)
301 {
302  int opt_device = 0;
303  unsigned int opt_scale = 1;
304  std::string input = "";
305  std::string modelFile = vpIoTools::getViSPImagesDataPath() + "/dnn/object_detection/ssd-mobilenet.onnx";
306  std::string labelFile = vpIoTools::getViSPImagesDataPath() + "/dnn/object_detection/pascal-voc-labels.txt";
307  std::string config = "";
308  float meanR = 127.5f, meanG = 127.5f, meanB = 127.5f;
309  float confThresh = 0.5f;
310  float nmsThresh = 0.4f;
311 
312  for (int i = 1; i < argc; i++) {
313  if (std::string(argv[i]) == "--device" && i + 1 < argc) {
314  opt_device = atoi(argv[i + 1]);
315  }
316  else if (std::string(argv[i]) == "--input" && i + 1 < argc) {
317  input = std::string(argv[i + 1]);
318  }
319  else if (std::string(argv[i]) == "--model" && i + 1 < argc) {
320  modelFile = std::string(argv[i + 1]);
321  }
322  else if (std::string(argv[i]) == "--config" && i + 1 < argc) {
323  config = std::string(argv[i + 1]);
324  }
325  else if (std::string(argv[i]) == "--input-scale" && i + 1 < argc) {
326  opt_scale = atoi(argv[i + 1]);
327  }
328  else if (std::string(argv[i]) == "--mean" && i + 3 < argc) {
329  meanR = atof(argv[i + 1]);
330  meanG = atof(argv[i + 2]);
331  meanB = atof(argv[i + 3]);
332  }
333  else if (std::string(argv[i]) == "--confThresh" && i + 1 < argc) {
334  confThresh = (float)atof(argv[i + 1]);
335  }
336  else if (std::string(argv[i]) == "--nmsThresh" && i + 1 < argc) {
337  nmsThresh = (float)atof(argv[i + 1]);
338  }
339  else if (std::string(argv[i]) == "--labels" && i + 1 < argc) {
340  labelFile = std::string(argv[i + 1]);
341  }
342  else if (std::string(argv[i]) == "--help" || std::string(argv[i]) == "-h") {
343  std::cout << argv[0]
344  << " [--device <camera device number>] [--input <path to image or video>"
345  " (camera is used if input is empty)] [--model <path to net trained weights>]"
346  " [--config <path to net config file>]"
347  " [--input-scale <input scale factor>] [--mean <meanR meanG meanB>]"
348  " [--confThresh <confidence threshold>]"
349  " [--nmsThresh <NMS threshold>] [--labels <path to label file>]"
350  << std::endl;
351  return EXIT_SUCCESS;
352  }
353  }
354 
355  std::string model_path(modelFile);
356  int batch_size = 1;
357 
358  std::vector<std::string> labels;
359  if (!labelFile.empty()) {
360  std::ifstream f_label(labelFile);
361  std::string line;
362  while (std::getline(f_label, line)) {
363  labels.push_back(line);
364  }
365  }
366 
368  // Parse the model and initialize the engine and the context.
369  TRTUniquePtr<nvinfer1::ICudaEngine> engine { nullptr };
370  TRTUniquePtr<nvinfer1::IExecutionContext> context { nullptr };
371  if (!parseOnnxModel(model_path, engine, context)) // Problem parsing Onnx model
372  {
373  std::cout << "Make sure the model file exists. To see available models, plese visit: "
374  "\n\twww.github.com/lagadic/visp-images/dnn/object_detection/"
375  << std::endl;
376  return EXIT_FAILURE;
377  }
379 
380  std::vector<nvinfer1::Dims> input_dims;
381  std::vector<nvinfer1::Dims> output_dims;
382  std::vector<void *> buffers(engine->getNbBindings()); // buffers for input and output data.
383 
385  for (int i = 0; i < engine->getNbBindings(); ++i) {
386  auto binding_size = getSizeByDim(engine->getBindingDimensions(i)) * batch_size * sizeof(float);
387  cudaMalloc(&buffers[i], binding_size);
388 
389  if (engine->bindingIsInput(i)) {
390  input_dims.emplace_back(engine->getBindingDimensions(i));
391  }
392  else {
393  output_dims.emplace_back(engine->getBindingDimensions(i));
394  }
395  }
396 
397  if (input_dims.empty() || output_dims.empty()) {
398  std::cerr << "Expect at least one input and one output for network" << std::endl;
399  return EXIT_FAILURE;
400  }
402 
404  cv::VideoCapture capture;
405 
406  if (input.empty()) {
407  capture.open(opt_device);
408  }
409  else {
410  capture.open(input);
411  }
412 
413  if (!capture.isOpened()) { // check if we succeeded
414  std::cout << "Failed to open the camera" << std::endl;
415  return EXIT_FAILURE;
416  }
417 
418  int cap_width = (int)capture.get(cv::CAP_PROP_FRAME_WIDTH);
419  int cap_height = (int)capture.get(cv::CAP_PROP_FRAME_HEIGHT);
420  capture.set(cv::CAP_PROP_FRAME_WIDTH, cap_width / opt_scale);
421  capture.set(cv::CAP_PROP_FRAME_HEIGHT, cap_height / opt_scale);
423 
424  vpImage<vpRGBa> I;
425  cv::Mat frame;
426  capture >> frame;
427 
428  if (input.empty()) {
429  int i = 0;
430  while ((i++ < 20) && !capture.read(frame)) {
431  }; // warm up camera by skiping unread frames
432  }
433 
434  vpImageConvert::convert(frame, I);
435  int height = I.getHeight(), width = I.getWidth();
436 
437  std::cout << "Image size: " << width << " x " << height << std::endl;
438 
439  std::vector<cv::Rect> boxesNMS;
440  std::vector<int> classIds;
441 
442  vpDisplayX d(I);
443 
444  double start, stop;
446  while (!vpDisplay::getClick(I, false)) {
447  // get frame.
448  capture >> frame;
449 
450  vpImageConvert::convert(frame, I);
451 
452  start = vpTime::measureTimeMs();
453  // preprocess
454  preprocessImage(frame, (float *)buffers[0], input_dims[0], meanR, meanG, meanB);
455 
456  // inference.
457  context->enqueue(batch_size, buffers.data(), 0, nullptr);
458 
459  // post-process
460  boxesNMS = postprocessResults(buffers, output_dims, batch_size, width, height, confThresh, nmsThresh, classIds);
461 
462  stop = vpTime::measureTimeMs();
463  // display.
465  vpDisplay::displayText(I, 10, 10, std::to_string(stop - start), vpColor::red);
466 
467  for (unsigned int i = 0; i < boxesNMS.size(); i++) {
468  vpDisplay::displayRectangle(I, vpRect(boxesNMS[i].x, boxesNMS[i].y, boxesNMS[i].width, boxesNMS[i].height),
469  vpColor::red, false, 2);
470  vpDisplay::displayText(I, boxesNMS[i].y - 10, boxesNMS[i].x, labels[classIds[i]], vpColor::red);
471  }
472 
473  vpDisplay::flush(I);
474  }
476 
477  for (void *buf : buffers)
478  cudaFree(buf);
479 
480  return EXIT_SUCCESS;
481 }
482 #else
483 int main()
484 {
485  std::cout << "OpenCV is not built with CUDA." << std::endl;
486 
487  return EXIT_SUCCESS;
488 }
489 #endif
490 
491 #else
492 int main()
493 {
494  std::cout << "ViSP is not built with TensorRT." << std::endl;
495 
496  return EXIT_SUCCESS;
497 }
498 #endif
static const vpColor red
Definition: vpColor.h:217
static bool getClick(const vpImage< unsigned char > &I, bool blocking=true)
static void display(const vpImage< unsigned char > &I)
static void flush(const vpImage< unsigned char > &I)
static void displayRectangle(const vpImage< unsigned char > &I, const vpImagePoint &topLeft, unsigned int width, unsigned int height, const vpColor &color, bool fill=false, unsigned int thickness=1)
static void displayText(const vpImage< unsigned char > &I, const vpImagePoint &ip, const std::string &s, const vpColor &color)
static void convert(const vpImage< unsigned char > &src, vpImage< vpRGBa > &dest)
unsigned int getWidth() const
Definition: vpImage.h:242
unsigned int getHeight() const
Definition: vpImage.h:181
static std::string getViSPImagesDataPath()
Definition: vpIoTools.cpp:1053
static bool checkFilename(const std::string &filename)
Definition: vpIoTools.cpp:786
Defines a rectangle in the plane.
Definition: vpRect.h:79
VISP_EXPORT double measureTimeMs()