#include <visp3/core/vpConfig.h>
#if ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L))) && \
  defined(VISP_HAVE_NLOHMANN_JSON) && defined(VISP_HAVE_OPENCV) && defined(HAVE_OPENCV_VIDEOIO) && \
  defined(HAVE_OPENCV_DNN) && (defined(VISP_HAVE_X11) || defined(VISP_HAVE_GDI) || defined(HAVE_OPENCV_HIGHGUI)) && \
  defined(VISP_HAVE_THREADS)
#include <visp3/core/vpIoTools.h>
#include <visp3/detection/vpDetectorDNNOpenCV.h>
#include <visp3/gui/vpDisplayGDI.h>
#include <visp3/gui/vpDisplayOpenCV.h>
#include <visp3/gui/vpDisplayX.h>
#include <visp3/dnn_tracker/vpMegaPose.h>
#include <visp3/dnn_tracker/vpMegaPoseTracker.h>
#include <visp3/io/vpJsonArgumentParser.h>

#include VISP_NLOHMANN_JSON(json.hpp)

#include <opencv2/videoio.hpp>

using json = nlohmann::json;
#ifdef ENABLE_VISP_NAMESPACE
using namespace VISP_NAMESPACE_NAME;
#endif
  // Linear interpolation between the low and high colors, driven by f in [0, 1]
  const float r = (float)low.R + ((float)high.R - (float)low.R) * f;
  const float g = (float)low.G + ((float)high.G - (float)low.G) * f;
  const float b = (float)low.B + ((float)high.B - (float)low.B) * f;
  return vpColor((unsigned char)r, (unsigned char)g, (unsigned char)b);
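// Display a "score bar" in the lower part of the image: the bar's filled fraction and its
// color, interpolated between a low and a high color, reflect the Megapose confidence score.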
  const unsigned top = static_cast<unsigned>(I.getHeight() * 0.85f);
  const unsigned height = static_cast<unsigned>(I.getHeight() * 0.1f);
  const unsigned left = static_cast<unsigned>(I.getWidth() * 0.05f);
  const unsigned width = static_cast<unsigned>(I.getWidth() * 0.5f);
  vpRect full(left, top, width, height);
  vpRect scoreRect(left, top, width * score, height);
  const vpColor c = interpolate(low, high, score);
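// Paste the overlay rendered by Megapose onto the camera image, keeping only non-black
// pixels so that the original frame stays visible around the rendered object.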
  for (unsigned int i = 0; i < I.getHeight(); ++i) {
    for (unsigned int j = 0; j < I.getWidth(); ++j) {
      if (overlay[i][j] != black) {
        I[i][j] = overlay[i][j];
      }
    }
  }
std::optional<vpRect> detectObjectForInitMegaposeDnn(vpDetectorDNNOpenCV &detector, const cv::Mat &I,
                                                     const std::string &detectionLabel,
                                                     std::optional<vpMegaPoseEstimate> previousEstimate)
{
  std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> detections_vec;
  detector.detect(I, detections_vec);
  std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> matchingDetections;
  for (const auto &detection : detections_vec) {
    std::optional<std::string> classnameOpt = detection.getClassName();
    if (classnameOpt && *classnameOpt == detectionLabel) {
      matchingDetections.push_back(detection);
    }
  }
  if (matchingDetections.size() == 0) {
    return std::nullopt;
  }
  else if (matchingDetections.size() == 1) {
    return matchingDetections[0].getBoundingBox();
  }
  if (previousEstimate) { // Keep the detection closest to the previously estimated pose
    double bestDist = 10000.f;
    const vpImagePoint previousCenter = (*previousEstimate).boundingBox.getCenter();
    for (const auto &detection : matchingDetections) {
      const vpRect detectionBB = detection.getBoundingBox();
      const double matchDist = vpImagePoint::distance(detectionBB.getCenter(), previousCenter);
      if (matchDist < bestDist) {
        bestDist = matchDist;
    // Otherwise, keep the detection with the highest confidence
    double highestConf = 0.0;
    for (const auto &detection : matchingDetections) {
      const double conf = detection.getConfidenceScore();
      if (conf > highestConf) {
        highestConf = conf;
        best = detection.getBoundingBox();
std::optional<vpRect> detectObjectForInitMegaposeClick(const vpImage<vpRGBa> &I)
{
  if (startLabelling) {
    vpRect bb(topLeft, bottomRight);
NLOHMANN_JSON_SERIALIZE_ENUM(DetectionMethod, {
int main(int argc, const char *argv[])
{
  unsigned width = 640, height = 480;
  vpCameraParameters cam;
  std::string videoDevice = "0";
  std::string megaposeAddress = "127.0.0.1";
  unsigned megaposePort = 5555;
  int refinerIterations = 1, coarseNumSamples = 576;
  double reinitThreshold = 0.2;

  DetectionMethod detectionMethod = DetectionMethod::UNKNOWN;

  std::string detectorModelPath = "path/to/model.onnx", detectorConfig = "none";
  std::string detectorFramework = "onnx", detectorTypeString = "yolov7";
  std::string objectName = "cube";
  std::vector<std::string> labels = { "cube" };
  float detectorMeanR = 0.f, detectorMeanG = 0.f, detectorMeanB = 0.f;
  float detectorConfidenceThreshold = 0.65f, detectorNmsThreshold = 0.5f, detectorFilterThreshold = -0.25f;
  float detectorScaleFactor = 0.0039f;
  bool detectorSwapRB = false;
  parser.addArgument("width", width, true, "The image width")
    .addArgument("height", height, true, "The image height")
    .addArgument("camera", cam, true, "The camera intrinsic parameters. Should correspond to a perspective projection model without distortion.")
    .addArgument("video-device", videoDevice, true, "Video device")
    .addArgument("object", objectName, true, "Name of the object to track with megapose.")
    .addArgument("detectionMethod", detectionMethod, true, "How to perform detection of the object to get the bounding box:"
                 " \"click\" for user labelling, \"dnn\" for dnn detection.")
    .addArgument("reinitThreshold", reinitThreshold, false, "If the Megapose score falls below this threshold, then a reinitialization is required."
                 " Should be between 0 and 1.")
    .addArgument("megapose/address", megaposeAddress, true, "IP address of the Megapose server.")
    .addArgument("megapose/port", megaposePort, true, "Port on which the Megapose server listens for connections.")
    .addArgument("megapose/refinerIterations", refinerIterations, false, "Number of Megapose refiner model iterations."
                 " A higher count may lead to better accuracy, at the cost of more processing time.")
    .addArgument("megapose/initialisationNumSamples", coarseNumSamples, false, "Number of Megapose renderings used for the initial pose estimation.")

    .addArgument("detector/model-path", detectorModelPath, true, "Path to the model")
    .addArgument("detector/config", detectorConfig, true, "Path to the model configuration. Set to none if config is not required.")
    .addArgument("detector/framework", detectorFramework, true, "Detector framework")
    .addArgument("detector/type", detectorTypeString, true, "Detector type")
    .addArgument("detector/labels", labels, true, "Detection class labels")
    .addArgument("detector/mean/red", detectorMeanR, false, "Detector mean red component. Used to normalize the image.")
    .addArgument("detector/mean/green", detectorMeanG, false, "Detector mean green component. Used to normalize the image.")
    .addArgument("detector/mean/blue", detectorMeanB, false, "Detector mean blue component. Used to normalize the image.")
    .addArgument("detector/confidenceThreshold", detectorConfidenceThreshold, false, "Detector confidence threshold."
                 " Detections with a confidence below this threshold are ignored.")
    .addArgument("detector/nmsThreshold", detectorNmsThreshold, false, "Detector non-maximal suppression threshold.")
    .addArgument("detector/filterThreshold", detectorFilterThreshold, false)
    .addArgument("detector/scaleFactor", detectorScaleFactor, false, "Pixel intensity rescaling factor. If set to 1/255, then pixel values are between 0 and 1.")
    .addArgument("detector/swapRedAndBlue", detectorSwapRB, false, "Whether to swap red and blue channels before feeding the image to the detector.");
  parser.parse(argc, argv);
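  // Arguments can presumably also be supplied through a JSON file handled by vpJsonArgumentParser,
  // where the "/" in an argument name maps to a nested object. A hypothetical configuration file,
  // with illustrative values only, could look like:
  // {
  //   "width": 640, "height": 480,
  //   "video-device": "0",
  //   "object": "cube",
  //   "detectionMethod": "dnn",
  //   "reinitThreshold": 0.2,
  //   "megapose": { "address": "127.0.0.1", "port": 5555 },
  //   "detector": { "model-path": "path/to/model.onnx", "type": "yolov7" }
  // }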
  if (detectionMethod == DetectionMethod::UNKNOWN) {
  cv::VideoCapture capture;
  bool isLiveCapture;
  bool hasCaptureOpeningSucceeded;
  double videoFrametime = 0;
  if (vpMath::isNumber(videoDevice)) { // Numeric device index: live camera stream
    hasCaptureOpeningSucceeded = capture.open(std::atoi(videoDevice.c_str()));
    isLiveCapture = true;
  }
  else { // Otherwise, treat the device string as a path to a pre-recorded video
    hasCaptureOpeningSucceeded = capture.open(videoDevice);
    isLiveCapture = false;
    double fps = capture.get(cv::CAP_PROP_FPS);
    videoFrametime = (1.0 / fps) * 1000.0;
  }
  if (!hasCaptureOpeningSucceeded) {
    std::cout << "Capture from camera: " << videoDevice << " didn't work" << std::endl;
#if defined(VISP_HAVE_X11)
#elif defined(VISP_HAVE_GDI)
#elif defined(HAVE_OPENCV_HIGHGUI)
#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
  ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
  vpDetectorDNNOpenCV::DNNResultsParsingType detectorType =
    vpDetectorDNNOpenCV::dnnResultsParsingTypeFromString(detectorTypeString);
  vpDetectorDNNOpenCV::NetConfig netConfig(detectorConfidenceThreshold, detectorNmsThreshold, labels,
                                           cv::Size(width, height), detectorFilterThreshold);
  vpDetectorDNNOpenCV dnn(netConfig, detectorType);
  if (detectionMethod == DetectionMethod::DNN) {
    dnn.readNet(detectorModelPath, detectorConfig, detectorFramework);
    dnn.setMean(detectorMeanR, detectorMeanG, detectorMeanB);
    dnn.setScaleFactor(detectorScaleFactor);
    dnn.setSwapRB(detectorSwapRB);
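    // The mean, scale factor and channel swap define how each frame is normalized before being
    // fed to the network; they should match the preprocessing used when the detector was trained.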
  std::shared_ptr<vpMegaPose> megapose;
  megapose = std::make_shared<vpMegaPose>(megaposeAddress, megaposePort, cam, height, width);
  megapose->setCoarseNumSamples(coarseNumSamples);
  const std::vector<std::string> allObjects = megapose->getObjectNames();
  if (std::find(allObjects.begin(), allObjects.end(), objectName) == allObjects.end()) {
  std::future<vpMegaPoseEstimate> trackerFuture;

  bool callMegapose = true;
  bool initialized = false;
  bool tracking = false;

  bool overlayModel = true;

  std::string overlayMode = "full";

  std::vector<double> megaposeTimes;
  std::vector<double> frameTimes;

  double megaposeStartTime = 0.0;
    if (!callMegapose && trackerFuture.wait_for(std::chrono::milliseconds(0)) == std::future_status::ready) {
      megaposeEstimate = trackerFuture.get();

      overlayImage = megapose->viewObjects({ objectName }, { megaposeEstimate.cTo }, overlayMode);

      if (megaposeEstimate.score < reinitThreshold) {
      std::optional<vpRect> detection = std::nullopt;
#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
  ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
      if (detectionMethod == DetectionMethod::DNN) {
        detection = detectObjectForInitMegaposeDnn(
          dnn, frame, objectName, initialized ? std::optional(megaposeEstimate) : std::nullopt);
      }
#endif
      if (detectionMethod == DetectionMethod::CLICK) {
        detection = detectObjectForInitMegaposeClick(I);
      }

      if (detection) {
        lastDetection = *detection;
        trackerFuture = megaposeTracker.init(I, lastDetection);
        callMegapose = false;
      }

      // Once the tracker is initialized, later Megapose calls refine the previous estimate
      trackerFuture = megaposeTracker.track(I);
      callMegapose = false;
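      // Both init() and track() return a std::future: the request to the Megapose server runs
      // asynchronously, and the estimate is only collected once wait_for() reports the future as
      // ready, so the acquisition and display loop is not blocked while Megapose computes.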
      std::string keyboardEvent;

      if (keyboardEvent == "t") {
        overlayModel = !overlayModel;
      }
      else if (keyboardEvent == "w") {
        overlayMode = overlayMode == "full" ? "wireframe" : "full";
      overlayRender(I, overlayImage);

    displayScore(I, megaposeEstimate.score);

    if (!isLiveCapture) {
      vpTime::wait(std::max<double>(0.0, videoFrametime - (frameEnd - frameStart)));
  std::cout << "Average frame time: " << vpMath::getMean(frameTimes) << std::endl;
  std::cout << "Average time between Megapose calls: " << vpMath::getMean(megaposeTimes) << std::endl;
  std::cout << "Compile ViSP with the DNN tracker module, the JSON 3rd party library and the OpenCV detection module" << std::endl;