Visual Servoing Platform  version 3.6.1 under development (2024-05-17)
tutorial-megapose-live-single-object-tracking.cpp
#include <iostream>

#include <visp3/core/vpConfig.h>

// Check if std::c++17 or higher
#if ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L))) && \
  defined(VISP_HAVE_NLOHMANN_JSON) && defined(VISP_HAVE_OPENCV) && defined(HAVE_OPENCV_VIDEOIO) && \
  defined(HAVE_OPENCV_DNN) && (defined(VISP_HAVE_X11) || defined(VISP_HAVE_GDI) || defined(HAVE_OPENCV_HIGHGUI)) && \
  defined(VISP_HAVE_THREADS)

#include <optional>

#include <visp3/core/vpIoTools.h>
#include <visp3/detection/vpDetectorDNNOpenCV.h>
#include <visp3/gui/vpDisplayGDI.h>
#include <visp3/gui/vpDisplayOpenCV.h>
#include <visp3/gui/vpDisplayX.h>
#include <visp3/dnn_tracker/vpMegaPose.h>
#include <visp3/dnn_tracker/vpMegaPoseTracker.h>
#include <visp3/io/vpJsonArgumentParser.h>

#include <nlohmann/json.hpp>

#include <opencv2/videoio.hpp>

using json = nlohmann::json;

/*
 * Interpolate two vpColors. Linear interpolation between each component (R, G, B).
 *
 * low: starting color
 * high: ending color
 * f: interpolation factor, between 0 and 1
 * Returns the interpolated color.
 */
vpColor interpolate(const vpColor &low, const vpColor &high, const float f)
{
  const float r = (float)low.R + ((float)high.R - (float)low.R) * f;
  const float g = (float)low.G + ((float)high.G - (float)low.G) * f;
  const float b = (float)low.B + ((float)high.B - (float)low.B) * f;
  return vpColor((unsigned char)r, (unsigned char)g, (unsigned char)b);
}
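// For example, interpolate(vpColor::red, vpColor::green, 0.5f) returns a color halfway between red and green.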

/*
 * Display the Megapose confidence score as a rectangle in the image.
 * This rectangle becomes green when Megapose is "confident" about its prediction.
 * The confidence score measures whether Megapose can, from its pose estimation, recover the true pose in future pose refinement iterations.
 *
 * \param[in] I : The image in which to display the confidence.
 * \param[in] score : The confidence score of Megapose, between 0 and 1.
 */
void displayScore(const vpImage<vpRGBa> &I, float score)
{
  const unsigned top = static_cast<unsigned>(I.getHeight() * 0.85f);
  const unsigned height = static_cast<unsigned>(I.getHeight() * 0.1f);
  const unsigned left = static_cast<unsigned>(I.getWidth() * 0.05f);
  const unsigned width = static_cast<unsigned>(I.getWidth() * 0.5f);
  vpRect full(left, top, width, height);
  vpRect scoreRect(left, top, width * score, height);
  const vpColor low = vpColor::red;
  const vpColor high = vpColor::green;
  const vpColor c = interpolate(low, high, score);

  vpDisplay::displayRectangle(I, full, c, false, 5);
  vpDisplay::displayRectangle(I, scoreRect, c, true, 1);
}

/*
 * Add the Megapose rendering on top of the actual image I.
 * Requires I and overlay to be the same size.
 * Note that a fully black object will not render.
 */
void overlayRender(vpImage<vpRGBa> &I, const vpImage<vpRGBa> &overlay)
{
  const vpRGBa black = vpRGBa(0, 0, 0);
  for (unsigned int i = 0; i < I.getHeight(); ++i) {
    for (unsigned int j = 0; j < I.getWidth(); ++j) {
      if (overlay[i][j] != black) {
        I[i][j] = overlay[i][j];
      }
    }
  }
}

/*
 * Run the detection network on an image in order to find a specific object.
 * The best matching detection is returned:
 * - If a previous Megapose estimation is available, find the closest match in the image (Euclidean distance between centers)
 * - Otherwise, take the detection with highest confidence
 * If no detection corresponding to detectionLabel is found, then std::nullopt is returned.
 */
std::optional<vpRect> detectObjectForInitMegaposeDnn(vpDetectorDNNOpenCV &detector, const cv::Mat &I,
                                                     const std::string &detectionLabel,
                                                     std::optional<vpMegaPoseEstimate> previousEstimate)
{
  std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> detections_vec;
  detector.detect(I, detections_vec);
  std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> matchingDetections;
  for (const auto &detection : detections_vec) {
    std::optional<std::string> classnameOpt = detection.getClassName();
    if (classnameOpt) {
      if (*classnameOpt == detectionLabel) {
        matchingDetections.push_back(detection);
      }
    }
  }
  if (matchingDetections.size() == 0) {
    return std::nullopt;
  }
  else if (matchingDetections.size() == 1) {
    return matchingDetections[0].getBoundingBox();
  }
  else {
    // Get detection that is closest to previous object bounding box estimated by Megapose
    if (previousEstimate) {
      vpRect best;
      double bestDist = 10000.0;
      const vpImagePoint previousCenter = (*previousEstimate).boundingBox.getCenter();
      for (const auto &detection : matchingDetections) {
        const vpRect detectionBB = detection.getBoundingBox();
        const vpImagePoint center = detectionBB.getCenter();
        const double matchDist = vpImagePoint::distance(center, previousCenter);
        if (matchDist < bestDist) {
          bestDist = matchDist;
          best = detectionBB;
        }
      }
      return best;
    }
    else { // Get detection with highest confidence
      vpRect best;
      double highestConf = 0.0;
      for (const auto &detection : matchingDetections) {
        const double conf = detection.getConfidenceScore();
        if (conf > highestConf) {
          highestConf = conf;
          best = detection.getBoundingBox();
        }
      }
      return best;
    }
  }
  return std::nullopt;
}

/*
 * Ask the user to provide the detection themselves. They must click to start labelling,
 * then click on the top-left and bottom-right corners of the bounding box.
 */
std::optional<vpRect> detectObjectForInitMegaposeClick(const vpImage<vpRGBa> &I)
{
  const bool startLabelling = vpDisplay::getClick(I, false);

  const vpImagePoint textPosition(10.0, 20.0);

  if (startLabelling) {
    vpImagePoint topLeft, bottomRight;
    vpDisplay::displayText(I, textPosition, "Click the upper left corner of the bounding box", vpColor::red);
    vpDisplay::flush(I);
    vpDisplay::getClick(I, topLeft, true);
    vpDisplay::displayCross(I, topLeft, 5, vpColor::red, 2);
    vpDisplay::displayText(I, textPosition, "Click the bottom right corner of the bounding box", vpColor::red);
    vpDisplay::flush(I);
    vpDisplay::getClick(I, bottomRight, true);
    vpRect bb(topLeft, bottomRight);
    return bb;
  }
  else {
    vpDisplay::displayText(I, textPosition, "Click when the object is visible and static to start reinitializing megapose.", vpColor::red);
    vpDisplay::flush(I);
    return std::nullopt;
  }
}

enum DetectionMethod
{
  UNKNOWN,
  CLICK,
  DNN
};

NLOHMANN_JSON_SERIALIZE_ENUM(DetectionMethod, {
  {UNKNOWN, nullptr}, // Default value if the json string is neither "click" nor "dnn"
  {CLICK, "click"},
  {DNN, "dnn"} }
);
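// With this mapping, the JSON strings "click" and "dnn" deserialize directly to the enum,
// e.g. (illustrative): DetectionMethod m = json("dnn").get<DetectionMethod>(); // m == DNN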

int main(int argc, const char *argv[])
{
  unsigned width = 640, height = 480;
  vpCameraParameters cam;
  std::string videoDevice = "0";
  std::string megaposeAddress = "127.0.0.1";
  unsigned megaposePort = 5555;
  int refinerIterations = 1, coarseNumSamples = 576;
  double reinitThreshold = 0.2;

  DetectionMethod detectionMethod = DetectionMethod::UNKNOWN;

  std::string detectorModelPath = "path/to/model.onnx", detectorConfig = "none";
  std::string detectorFramework = "onnx", detectorTypeString = "yolov7";
  std::string objectName = "cube";
  std::vector<std::string> labels = { "cube" };
  float detectorMeanR = 0.f, detectorMeanG = 0.f, detectorMeanB = 0.f;
  float detectorConfidenceThreshold = 0.65f, detectorNmsThreshold = 0.5f, detectorFilterThreshold = -0.25f;
  float detectorScaleFactor = 0.0039f;
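  // Note: 0.0039 is approximately 1/255, so pixel intensities are rescaled from [0, 255] to roughly [0, 1] before inference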
  bool detectorSwapRB = false;

  vpJsonArgumentParser parser("Single object tracking with Megapose", "--config", "/");
  parser.addArgument("width", width, true, "The image width")
    .addArgument("height", height, true, "The image height")
    .addArgument("camera", cam, true, "The camera intrinsic parameters. Should correspond to a perspective projection model without distortion.")
    .addArgument("video-device", videoDevice, true, "Video device")
    .addArgument("object", objectName, true, "Name of the object to track with megapose.")
    .addArgument("detectionMethod", detectionMethod, true, "How to perform detection of the object to get the bounding box:"
                 " \"click\" for user labelling, \"dnn\" for dnn detection.")
    .addArgument("reinitThreshold", reinitThreshold, false, "If the Megapose score falls below this threshold, then a reinitialization is required."
                 " Should be between 0 and 1")
    .addArgument("megapose/address", megaposeAddress, true, "IP address of the Megapose server.")
    .addArgument("megapose/port", megaposePort, true, "Port on which the Megapose server listens for connections.")
    .addArgument("megapose/refinerIterations", refinerIterations, false, "Number of Megapose refiner model iterations."
                 " A higher count may lead to better accuracy, at the cost of more processing time")
    .addArgument("megapose/initialisationNumSamples", coarseNumSamples, false, "Number of Megapose renderings used for the initial pose estimation.")

    .addArgument("detector/model-path", detectorModelPath, true, "Path to the model")
    .addArgument("detector/config", detectorConfig, true, "Path to the model configuration. Set to none if config is not required.")
    .addArgument("detector/framework", detectorFramework, true, "Detector framework")
    .addArgument("detector/type", detectorTypeString, true, "Detector type")
    .addArgument("detector/labels", labels, true, "Detection class labels")
    .addArgument("detector/mean/red", detectorMeanR, false, "Detector mean red component. Used to normalize image")
    .addArgument("detector/mean/green", detectorMeanG, false, "Detector mean green component. Used to normalize image")
    .addArgument("detector/mean/blue", detectorMeanB, false, "Detector mean blue component. Used to normalize image")
    .addArgument("detector/confidenceThreshold", detectorConfidenceThreshold, false, "Detector confidence threshold. "
                 "Detections with a confidence below this threshold are ignored")
    .addArgument("detector/nmsThreshold", detectorNmsThreshold, false, "Detector non maximal suppression threshold.")
    .addArgument("detector/filterThreshold", detectorFilterThreshold, false)
    .addArgument("detector/scaleFactor", detectorScaleFactor, false, "Pixel intensity rescaling factor. If set to 1/255, then pixel values are between 0 and 1.")
    .addArgument("detector/swapRedAndBlue", detectorSwapRB, false, "Whether to swap red and blue channels before feeding the image to the detector.");
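
  // The "/" character passed to the parser above is the nesting delimiter: an argument key such as
  // "megapose/address" is looked up in a nested JSON object. A configuration file passed through
  // --config could therefore look like the following sketch (values are illustrative; the required
  // "camera" entry holds the vpCameraParameters intrinsics in their JSON form):
  // {
  //   "width": 640, "height": 480,
  //   "video-device": "0",
  //   "object": "cube",
  //   "detectionMethod": "click",
  //   "megapose": { "address": "127.0.0.1", "port": 5555 }
  // }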

  parser.parse(argc, argv);

  if (cam.get_projModel() != vpCameraParameters::perspectiveProjWithoutDistortion) {
    throw vpException(vpException::badValue, "The camera projection model should be without distortion, as other models are ignored by Megapose");
  }

  if (detectionMethod == DetectionMethod::UNKNOWN) {
    throw vpException(vpException::badValue, "The specified detection method is incorrect: it should be either \"click\" or \"dnn\"");
  }

  cv::VideoCapture capture;
  bool isLiveCapture;
  bool hasCaptureOpeningSucceeded;
  double videoFrametime = 0; // Only for prerecorded videos
  if (vpMath::isNumber(videoDevice)) {
    hasCaptureOpeningSucceeded = capture.open(std::atoi(videoDevice.c_str()));
    isLiveCapture = true;
  }
  else {
    hasCaptureOpeningSucceeded = capture.open(videoDevice);
    isLiveCapture = false;
    double fps = capture.get(cv::CAP_PROP_FPS);
    videoFrametime = (1.0 / fps) * 1000.0;
  }
  if (!hasCaptureOpeningSucceeded) {
    std::cout << "Capture from camera: " << videoDevice << " didn't work" << std::endl;
    return EXIT_FAILURE;
  }

  vpImage<vpRGBa> I;
#if defined(VISP_HAVE_X11)
  vpDisplayX d;
#elif defined(VISP_HAVE_GDI)
  vpDisplayGDI d;
#elif defined(HAVE_OPENCV_HIGHGUI)
  vpDisplayOpenCV d;
#endif
  //d.setDownScalingFactor(vpDisplay::SCALE_AUTO);
#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
  ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
  vpDetectorDNNOpenCV::DNNResultsParsingType detectorType =
    vpDetectorDNNOpenCV::dnnResultsParsingTypeFromString(detectorTypeString);
  vpDetectorDNNOpenCV::NetConfig netConfig(detectorConfidenceThreshold, detectorNmsThreshold, labels,
                                           cv::Size(width, height), detectorFilterThreshold);
  vpDetectorDNNOpenCV dnn(netConfig, detectorType);
  if (detectionMethod == DetectionMethod::DNN) {
    dnn.readNet(detectorModelPath, detectorConfig, detectorFramework);
    dnn.setMean(detectorMeanR, detectorMeanG, detectorMeanB);
    dnn.setScaleFactor(detectorScaleFactor);
    dnn.setSwapRB(detectorSwapRB);
  }
#endif

  std::shared_ptr<vpMegaPose> megapose;
  try {
    megapose = std::make_shared<vpMegaPose>(megaposeAddress, megaposePort, cam, height, width);
  }
  catch (...) {
    throw vpException(vpException::ioError, "Could not connect to Megapose server at " + megaposeAddress + " on port " + std::to_string(megaposePort));
  }

  vpMegaPoseTracker megaposeTracker(megapose, objectName, refinerIterations);
  megapose->setCoarseNumSamples(coarseNumSamples);
  const std::vector<std::string> allObjects = megapose->getObjectNames();
  if (std::find(allObjects.begin(), allObjects.end(), objectName) == allObjects.end()) {
    throw vpException(vpException::badValue, "Object " + objectName + " is not known by the Megapose server!");
  }
  std::future<vpMegaPoseEstimate> trackerFuture;

  cv::Mat frame;
  vpMegaPoseEstimate megaposeEstimate; // Last Megapose estimation
  vpRect lastDetection; // Last detection (initialization)
  bool callMegapose = true; // Whether we should call Megapose this iteration
  bool initialized = false; // Whether the tracker has been initialized; when false, a 2D detection is required to (re)initialize
  bool tracking = false;

  bool overlayModel = true;
  vpImage<vpRGBa> overlayImage(height, width);
  std::string overlayMode = "full";

  std::vector<double> megaposeTimes;
  std::vector<double> frameTimes;

  double megaposeStartTime = 0.0;

  while (true) {
    const double frameStart = vpTime::measureTimeMs();
    capture >> frame;
    if (frame.empty())
      break;

    if (I.getSize() == 0) {
      vpImageConvert::convert(frame, I);
      d.init(I);
      vpDisplay::setTitle(I, "Megapose object pose estimation");
    }
    else {
      vpImageConvert::convert(frame, I);
    }
    vpDisplay::display(I);

    // Check whether Megapose is still running
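    // The tracker is asynchronous: init() and track() return a std::future, which we poll with a
    // zero timeout so that image acquisition and display are never blocked by the Megapose server.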
    if (!callMegapose && trackerFuture.wait_for(std::chrono::milliseconds(0)) == std::future_status::ready) {
      megaposeEstimate = trackerFuture.get();
      if (tracking) {
        megaposeTimes.push_back(vpTime::measureTimeMs() - megaposeStartTime);
      }
      callMegapose = true;
      tracking = true;

      if (overlayModel) {
        overlayImage = megapose->viewObjects({ objectName }, { megaposeEstimate.cTo }, overlayMode);
      }

      if (megaposeEstimate.score < reinitThreshold) { // If confidence is low, require a reinitialisation with 2D detection
        initialized = false;
      }
    }

    if (callMegapose) {
      if (!initialized) {
        tracking = false;
        std::optional<vpRect> detection = std::nullopt;
#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
  ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
        if (detectionMethod == DetectionMethod::DNN) {
          detection = detectObjectForInitMegaposeDnn(
            dnn, frame, objectName, initialized ? std::optional(megaposeEstimate) : std::nullopt);
        }
#endif
        if (detectionMethod == DetectionMethod::CLICK) {
          detection = detectObjectForInitMegaposeClick(I);
        }

        if (detection) {
          initialized = true;
          lastDetection = *detection;
          trackerFuture = megaposeTracker.init(I, lastDetection);
          callMegapose = false;
        }
      }
      else {
        trackerFuture = megaposeTracker.track(I);
        callMegapose = false;
        megaposeStartTime = vpTime::measureTimeMs();
      }
    }

    std::string keyboardEvent;
    const bool keyPressed = vpDisplay::getKeyboardEvent(I, keyboardEvent, false);
    if (keyPressed) {
      if (keyboardEvent == "t") {
        overlayModel = !overlayModel;
      }
      else if (keyboardEvent == "w") {
        overlayMode = overlayMode == "full" ? "wireframe" : "full";
      }
    }

    if (tracking) {
      if (overlayModel) {
        overlayRender(I, overlayImage);
        vpDisplay::display(I);
      }
      vpDisplay::displayText(I, 20, 20, "Right click to quit", vpColor::red);
      vpDisplay::displayText(I, 30, 20, "Press T: Toggle overlay", vpColor::red);
      vpDisplay::displayText(I, 40, 20, "Press W: Toggle wireframe", vpColor::red);
      vpDisplay::displayFrame(I, megaposeEstimate.cTo, cam, 0.05, vpColor::none, 3);
      //vpDisplay::displayRectangle(I, lastDetection, vpColor::red);
      displayScore(I, megaposeEstimate.score);
    }

    vpDisplay::flush(I);

    vpMouseButton::vpMouseButtonType button;
    if (vpDisplay::getClick(I, button, false)) {
      if (button == vpMouseButton::button3) {
        break; // Right click to stop
      }
    }
    const double frameEnd = vpTime::measureTimeMs();
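    // For a prerecorded video, wait so that playback matches the source frame rate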
    if (!isLiveCapture) {
      vpTime::wait(std::max<double>(0.0, videoFrametime - (frameEnd - frameStart)));
    }
    frameTimes.push_back(vpTime::measureTimeMs() - frameStart);
  }
  std::cout << "Average frame time: " << vpMath::getMean(frameTimes) << std::endl;
  std::cout << "Average time between Megapose calls: " << vpMath::getMean(megaposeTimes) << std::endl;
}

#else
int main()
{
  std::cout << "Compile ViSP with the DNN tracker module, the JSON 3rd party library and the OpenCV detection module" << std::endl;
  return EXIT_SUCCESS;
}

#endif
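
A sketch of how the program is typically launched (the configuration file name is illustrative; it must define the arguments declared above, and a Megapose server must already be listening on the configured address and port):

  ./tutorial-megapose-live-single-object-tracking --config megapose-config.json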