#include <visp3/core/vpConfig.h>

#if ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L))) && \
  defined(VISP_HAVE_NLOHMANN_JSON) && defined(VISP_HAVE_OPENCV) && defined(HAVE_OPENCV_VIDEOIO) && \
  defined(HAVE_OPENCV_DNN) && (defined(VISP_HAVE_X11) || defined(VISP_HAVE_GDI) || defined(HAVE_OPENCV_HIGHGUI)) && \
  defined(VISP_HAVE_THREADS)

#include <algorithm>
#include <future>
#include <optional>

#include <visp3/core/vpIoTools.h>
#include <visp3/detection/vpDetectorDNNOpenCV.h>
#include <visp3/gui/vpDisplayGDI.h>
#include <visp3/gui/vpDisplayOpenCV.h>
#include <visp3/gui/vpDisplayX.h>
#include <visp3/dnn_tracker/vpMegaPose.h>
#include <visp3/dnn_tracker/vpMegaPoseTracker.h>
#include <visp3/io/vpJsonArgumentParser.h>

#include <nlohmann/json.hpp>

#include <opencv2/videoio.hpp>

using json = nlohmann::json;
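// As encoded in the guard above, this tutorial requires a C++17 compiler, the
// nlohmann_json 3rd-party library, OpenCV with the videoio and dnn modules, at least
// one display backend (X11, GDI or OpenCV highgui) and threads support.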
// Color interpolation, used by displayScore below
vpColor interpolate(const vpColor &low, const vpColor &high, const float f)
{
  const float r = ((float)high.R - (float)low.R) * f;
  const float g = ((float)high.G - (float)low.G) * f;
  const float b = ((float)high.B - (float)low.B) * f;
  return vpColor((unsigned char)r, (unsigned char)g, (unsigned char)b);
}
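// Note that f is expected to lie in [0, 1]: the returned color scales the per-channel
// difference between high and low (vpColor stores R, G and B as unsigned char).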
// Display a progress bar showing the current Megapose confidence score
void displayScore(const vpImage<vpRGBa> &I, float score)
{
  const unsigned top = static_cast<unsigned>(I.getHeight() * 0.85f);
  const unsigned height = static_cast<unsigned>(I.getHeight() * 0.1f);
  const unsigned left = static_cast<unsigned>(I.getWidth() * 0.05f);
  const unsigned width = static_cast<unsigned>(I.getWidth() * 0.5f);
  vpRect full(left, top, width, height);
  vpRect scoreRect(left, top, width * score, height);
  const vpColor low = vpColor::red;
  const vpColor high = vpColor::green;
  const vpColor c = interpolate(low, high, score);

  vpDisplay::displayRectangle(I, full, c, false, 5);
  vpDisplay::displayRectangle(I, scoreRect, c, true, 1);
}
// Copy the rendered object onto the camera image
void overlayRender(vpImage<vpRGBa> &I, const vpImage<vpRGBa> &overlay)
{
  const vpRGBa black = vpRGBa(0, 0, 0);
  for (unsigned int i = 0; i < I.getHeight(); ++i) {
    for (unsigned int j = 0; j < I.getWidth(); ++j) {
      if (overlay[i][j] != black) {
        I[i][j] = overlay[i][j];
      }
    }
  }
}
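// The renders returned by the Megapose server use black as the background color, so only
// non-black pixels (i.e., the rendered object) are copied onto the camera image.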
// Run the detection network on an image in order to find the object to track.
// Returns std::nullopt if no detection matching detectionLabel is found.
std::optional<vpRect> detectObjectForInitMegaposeDnn(vpDetectorDNNOpenCV &detector, const cv::Mat &I,
                                                     const std::string &detectionLabel,
                                                     std::optional<vpMegaPoseEstimate> previousEstimate)
{
  std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> detections_vec;
  detector.detect(I, detections_vec);
  std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> matchingDetections;
  for (const auto &detection : detections_vec) {
    std::optional<std::string> classnameOpt = detection.getClassName();
    if (classnameOpt) {
      if (*classnameOpt == detectionLabel) {
        matchingDetections.push_back(detection);
      }
    }
  }
  if (matchingDetections.size() == 0) {
    return std::nullopt;
  }
  else if (matchingDetections.size() == 1) {
    return matchingDetections[0].getBoundingBox();
  }
  else {
    // Multiple matches: keep the detection closest to the previous Megapose estimate
    if (previousEstimate) {
      vpRect best;
      double bestDist = 10000.f;
      const vpImagePoint previousCenter = (*previousEstimate).boundingBox.getCenter();
      for (const auto &detection : matchingDetections) {
        const vpRect detectionBB = detection.getBoundingBox();
        const vpImagePoint center = detectionBB.getCenter();
        const double matchDist = vpImagePoint::distance(center, previousCenter);
        if (matchDist < bestDist) {
          bestDist = matchDist;
          best = detectionBB;
        }
      }
      return best;
    }
    else { // No previous estimate: keep the detection with the highest confidence
      vpRect best;
      double highestConf = 0.0;
      for (const auto &detection : matchingDetections) {
        const double conf = detection.getConfidenceScore();
        if (conf > highestConf) {
          highestConf = conf;
          best = detection.getBoundingBox();
        }
      }
      return best;
    }
  }
}
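// Selection strategy used above: a single matching detection is returned directly; with
// several matches, the one closest to the previous Megapose estimate is preferred, and
// the highest-confidence detection is used when no previous estimate is available.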
// Ask the user to provide the object bounding box with two clicks
std::optional<vpRect> detectObjectForInitMegaposeClick(const vpImage<vpRGBa> &I)
{
  const bool startLabelling = vpDisplay::getClick(I, false);
  if (startLabelling) {
    vpImagePoint topLeft, bottomRight;
    vpDisplay::getClick(I, topLeft, true); // Top-left corner of the bounding box
    vpDisplay::getClick(I, bottomRight, true); // Bottom-right corner
    vpRect bb(topLeft, bottomRight);
    return bb;
  }
  return std::nullopt;
}
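// Note: the initial vpDisplay::getClick(I, false) polls without blocking, so the main
// loop keeps displaying frames until the user clicks; the two corner clicks that define
// the bounding box are then acquired in blocking mode.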
enum DetectionMethod
{
  UNKNOWN,
  CLICK,
  DNN
};

NLOHMANN_JSON_SERIALIZE_ENUM(DetectionMethod, {
  {UNKNOWN, nullptr},
  {CLICK, "click"},
  {DNN, "dnn"} });
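// With this mapping, the detectionMethod argument can be given as the string "click" or
// "dnn", both on the command line and in a JSON configuration file; any other value
// deserializes to UNKNOWN, which is rejected below.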
int main(int argc, const char *argv[])
{
  unsigned width = 640, height = 480;
  vpCameraParameters cam;
  std::string videoDevice = "0";
  std::string megaposeAddress = "127.0.0.1";
  unsigned megaposePort = 5555;
  int refinerIterations = 1, coarseNumSamples = 576;
  double reinitThreshold = 0.2;

  DetectionMethod detectionMethod = DetectionMethod::UNKNOWN;

  std::string detectorModelPath = "path/to/model.onnx", detectorConfig = "none";
  std::string detectorFramework = "onnx", detectorTypeString = "yolov7";
  std::string objectName = "cube";
  std::vector<std::string> labels = { "cube" };
  float detectorMeanR = 0.f, detectorMeanG = 0.f, detectorMeanB = 0.f;
  float detectorConfidenceThreshold = 0.65f, detectorNmsThreshold = 0.5f, detectorFilterThreshold = -0.25f;
  float detectorScaleFactor = 0.0039f;
  bool detectorSwapRB = false;
  vpJsonArgumentParser parser("Single object tracking with Megapose", "--config", "/");
  parser.addArgument("width", width, true, "The image width")
    .addArgument("height", height, true, "The image height")
    .addArgument("camera", cam, true, "The camera intrinsic parameters. Should correspond to a perspective projection model without distortion.")
    .addArgument("video-device", videoDevice, true, "Video device")
    .addArgument("object", objectName, true, "Name of the object to track with megapose.")
    .addArgument("detectionMethod", detectionMethod, true,
                 "How to perform detection of the object to get the bounding box:"
                 " \"click\" for user labelling, \"dnn\" for dnn detection.")
    .addArgument("reinitThreshold", reinitThreshold, false,
                 "If the Megapose score falls below this threshold, then a reinitialization is required."
                 " Should be between 0 and 1")
    .addArgument("megapose/address", megaposeAddress, true, "IP address of the Megapose server.")
    .addArgument("megapose/port", megaposePort, true, "Port on which the Megapose server listens for connections.")
    .addArgument("megapose/refinerIterations", refinerIterations, false,
                 "Number of Megapose refiner model iterations."
                 " A higher count may lead to better accuracy, at the cost of more processing time")
    .addArgument("megapose/initialisationNumSamples", coarseNumSamples, false, "Number of Megapose renderings used for the initial pose estimation.")

    .addArgument("detector/model-path", detectorModelPath, true, "Path to the model")
    .addArgument("detector/config", detectorConfig, true, "Path to the model configuration. Set to none if config is not required.")
    .addArgument("detector/framework", detectorFramework, true, "Detector framework")
    .addArgument("detector/type", detectorTypeString, true, "Detector type")
    .addArgument("detector/labels", labels, true, "Detection class labels")
    .addArgument("detector/mean/red", detectorMeanR, false, "Detector mean red component. Used to normalize image")
    .addArgument("detector/mean/green", detectorMeanG, false, "Detector mean green component. Used to normalize image")
    .addArgument("detector/mean/blue", detectorMeanB, false, "Detector mean blue component. Used to normalize image")
    .addArgument("detector/confidenceThreshold", detectorConfidenceThreshold, false,
                 "Detector confidence threshold. "
                 "When a detection has a confidence below this threshold, it is ignored")
    .addArgument("detector/nmsThreshold", detectorNmsThreshold, false, "Detector non maximal suppression threshold.")
    .addArgument("detector/filterThreshold", detectorFilterThreshold, false)
    .addArgument("detector/scaleFactor", detectorScaleFactor, false, "Pixel intensity rescaling factor. If set to 1/255, then pixel values are between 0 and 1.")
    .addArgument("detector/swapRedAndBlue", detectorSwapRB, false, "Whether to swap red and blue channels before feeding the image to the detector.");

  parser.parse(argc, argv);
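  // A hypothetical JSON configuration file, passed through the parser's JSON-file option
  // ("--config" above): the "/" separator maps argument names to nested JSON objects, and
  // values given on the command line override those read from the file.
  //
  // {
  //   "width": 640, "height": 480,
  //   "video-device": "0",
  //   "object": "cube",
  //   "detectionMethod": "dnn",
  //   "megapose": { "address": "127.0.0.1", "port": 5555 },
  //   "detector": { "model-path": "path/to/model.onnx", "type": "yolov7" }
  // }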
  if (detectionMethod == DetectionMethod::UNKNOWN) {
    throw vpException(vpException::badValue, "The specified detection method is incorrect: it should be either \"click\" or \"dnn\"");
  }
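  // The video device is interpreted in one of two ways: a numeric string such as "0" is
  // treated as a live camera index, while any other value is treated as the path to a
  // prerecorded video, whose framerate is used further below to pace playback.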
  cv::VideoCapture capture;
  bool isLiveCapture;
  bool hasCaptureOpeningSucceeded;
  double videoFrametime = 0; // Only used for prerecorded videos
  if (vpMath::isNumber(videoDevice)) {
    hasCaptureOpeningSucceeded = capture.open(std::atoi(videoDevice.c_str()));
    isLiveCapture = true;
  }
  else {
    hasCaptureOpeningSucceeded = capture.open(videoDevice);
    isLiveCapture = false;
    double fps = capture.get(cv::CAP_PROP_FPS);
    videoFrametime = (1.0 / fps) * 1000.0;
  }
  if (!hasCaptureOpeningSucceeded) {
    std::cout << "Capture from camera: " << videoDevice << " didn't work" << std::endl;
    return EXIT_FAILURE;
  }
  vpImage<vpRGBa> I;
#if defined(VISP_HAVE_X11)
  vpDisplayX d;
#elif defined(VISP_HAVE_GDI)
  vpDisplayGDI d;
#elif defined(HAVE_OPENCV_HIGHGUI)
  vpDisplayOpenCV d;
#endif
#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
  ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
  vpDetectorDNNOpenCV::DNNResultsParsingType detectorType =
    vpDetectorDNNOpenCV::dnnResultsParsingTypeFromString(detectorTypeString);
  vpDetectorDNNOpenCV::NetConfig netConfig(detectorConfidenceThreshold, detectorNmsThreshold, labels,
                                           cv::Size(width, height), detectorFilterThreshold);
  vpDetectorDNNOpenCV dnn(netConfig, detectorType);
  if (detectionMethod == DetectionMethod::DNN) {
    dnn.readNet(detectorModelPath, detectorConfig, detectorFramework);
    dnn.setMean(detectorMeanR, detectorMeanG, detectorMeanB);
    dnn.setScaleFactor(detectorScaleFactor);
    dnn.setSwapRB(detectorSwapRB);
  }
#endif
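  // The preprocessing parameters (scale factor, per-channel mean, red/blue swap) follow
  // the usual OpenCV DNN blob semantics: the mean is subtracted, then pixels are rescaled
  // by the scale factor. A factor of 0.0039 (roughly 1/255) maps intensities to [0, 1].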
  std::shared_ptr<vpMegaPose> megapose;
  try {
    megapose = std::make_shared<vpMegaPose>(megaposeAddress, megaposePort, cam, height, width);
  }
  catch (...) {
    throw vpException(vpException::ioError, "Could not connect to Megapose server at " + megaposeAddress + " on port " + std::to_string(megaposePort));
  }
  vpMegaPoseTracker megaposeTracker(megapose, objectName, refinerIterations);
  megapose->setCoarseNumSamples(coarseNumSamples);
  const std::vector<std::string> allObjects = megapose->getObjectNames();
  if (std::find(allObjects.begin(), allObjects.end(), objectName) == allObjects.end()) {
    throw vpException(vpException::badValue, "Object " + objectName + " is not known by the Megapose server!");
  }
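  // The Megapose server must already be running and reachable at the configured address
  // and port, and it must know the 3D model of the tracked object: getObjectNames()
  // returns the names of all the models loaded by the server.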
  std::future<vpMegaPoseEstimate> trackerFuture;

  cv::Mat frame;
  vpMegaPoseEstimate megaposeEstimate; // Last pose estimation returned by Megapose
  vpRect lastDetection; // Last detection used to (re)initialize tracking
  bool callMegapose = true; // Whether a new Megapose call should be issued this iteration
  bool initialized = false; // Whether tracking has been initialized
  bool tracking = false;

  bool overlayModel = true; // Whether to render the 3D model over the image
  vpImage<vpRGBa> overlayImage(height, width);
  std::string overlayMode = "full";

  std::vector<double> megaposeTimes;
  std::vector<double> frameTimes;
  double megaposeStartTime = 0.0;
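  // vpMegaPoseTracker works asynchronously: init() and track() return a std::future that
  // is polled once per frame with wait_for(std::chrono::milliseconds(0)), so the display
  // loop never blocks on the network round trip to the Megapose server.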
  while (true) {
    const double frameStart = vpTime::measureTimeMs();
    capture >> frame;
    if (frame.empty()) {
      break;
    }
    if (I.getSize() == 0) { // First frame: initialize the display
      vpImageConvert::convert(frame, I);
      d.init(I);
      vpDisplay::setTitle(I, "Megapose object pose estimation");
    }
    else {
      vpImageConvert::convert(frame, I);
    }
    vpDisplay::display(I);

    // Check whether the asynchronous Megapose call has completed
    if (!callMegapose && trackerFuture.wait_for(std::chrono::milliseconds(0)) == std::future_status::ready) {
      megaposeEstimate = trackerFuture.get();
      if (tracking) {
        megaposeTimes.push_back(vpTime::measureTimeMs() - megaposeStartTime);
      }
      callMegapose = true;
      tracking = true;

      if (overlayModel) {
        overlayImage = megapose->viewObjects({ objectName }, { megaposeEstimate.cTo }, overlayMode);
      }

      if (megaposeEstimate.score < reinitThreshold) { // Score too low: require a reinitialization
        initialized = false;
      }
    }
    if (callMegapose) {
      if (!initialized) {
        // Not yet initialized: acquire an initial bounding box of the object
        tracking = false;
        std::optional<vpRect> detection = std::nullopt;
#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
  ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
        if (detectionMethod == DetectionMethod::DNN) {
          detection = detectObjectForInitMegaposeDnn(
            dnn, frame, objectName, initialized ? std::optional(megaposeEstimate) : std::nullopt);
        }
#endif
        if (detectionMethod == DetectionMethod::CLICK) {
          detection = detectObjectForInitMegaposeClick(I);
        }
        if (detection) {
          initialized = true;
          lastDetection = *detection;
          trackerFuture = megaposeTracker.init(I, lastDetection);
          callMegapose = false;
        }
      }
      else {
        // Already initialized: request tracking from the previous pose estimate
        trackerFuture = megaposeTracker.track(I);
        callMegapose = false;
        megaposeStartTime = vpTime::measureTimeMs();
      }
    }
    // Keyboard shortcuts: "t" toggles the model overlay, "w" switches between full and wireframe rendering
    std::string keyboardEvent;
    const bool keyPressed = vpDisplay::getKeyboardEvent(I, keyboardEvent, false);
    if (keyPressed) {
      if (keyboardEvent == "t") {
        overlayModel = !overlayModel;
      }
      else if (keyboardEvent == "w") {
        overlayMode = overlayMode == "full" ? "wireframe" : "full";
      }
    }

    if (tracking) {
      if (overlayModel) {
        overlayRender(I, overlayImage);
      }
      vpDisplay::displayFrame(I, megaposeEstimate.cTo, cam, 0.05, vpColor::none, 3);
      displayScore(I, megaposeEstimate.score);
    }
    vpDisplay::flush(I);

    const double frameEnd = vpTime::measureTimeMs();
    if (!isLiveCapture) { // Pace playback of prerecorded videos to their framerate
      vpTime::wait(std::max<double>(0.0, videoFrametime - (frameEnd - frameStart)));
    }
    frameTimes.push_back(vpTime::measureTimeMs() - frameStart);
  }
  std::cout << "Average frame time: " << vpMath::getMean(frameTimes) << std::endl;
  std::cout << "Average time between Megapose calls: " << vpMath::getMean(megaposeTimes) << std::endl;
  return EXIT_SUCCESS;
}

#else
int main()
{
  std::cout << "Compile ViSP with the DNN tracker module, the JSON 3rd party library and the OpenCV detection module" << std::endl;
  return EXIT_SUCCESS;
}
#endif