Visual Servoing Platform version 3.5.1 under development (2023-05-12)
vpDetectorDNN.cpp
/****************************************************************************
 *
 * ViSP, open source Visual Servoing Platform software.
 * Copyright (C) 2005 - 2019 by Inria. All rights reserved.
 *
 * This software is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * See the file LICENSE.txt at the root directory of this source
 * distribution for additional information about the GNU GPL.
 *
 * For using ViSP with software that can not be combined with the GNU
 * GPL, please contact Inria about acquiring a ViSP Professional
 * Edition License.
 *
 * See http://visp.inria.fr for more information.
 *
 * This software was developed at:
 * Inria Rennes - Bretagne Atlantique
 * Campus Universitaire de Beaulieu
 * 35042 Rennes Cedex
 * France
 *
 * If you have questions regarding the use of this file, please contact
 * Inria at visp@inria.fr
 *
 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
 * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Description:
 * DNN object detection using OpenCV DNN module.
 *
 *****************************************************************************/
#include <visp3/core/vpConfig.h>

#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(VISP_HAVE_OPENCV_DNN)
#include <visp3/core/vpImageConvert.h>
#include <visp3/detection/vpDetectorDNN.h>

vpDetectorDNN::vpDetectorDNN()
  : m_blob(), m_boxes(), m_classIds(), m_confidences(), m_confidenceThreshold(0.5), m_I_color(), m_img(),
    m_inputSize(300, 300), m_mean(127.5, 127.5, 127.5), m_net(), m_nmsThreshold(0.4f), m_outNames(), m_outs(),
    m_scaleFactor(2.0 / 255.0), m_swapRB(true)
{
}

vpDetectorDNN::~vpDetectorDNN() {}

bool vpDetectorDNN::detect(const vpImage<unsigned char> &I)
{
  vpImageConvert::convert(I, m_I_color);

  std::vector<vpRect> boundingBoxes;
  return detect(m_I_color, boundingBoxes);
}

bool vpDetectorDNN::detect(const vpImage<vpRGBa> &I, std::vector<vpRect> &boundingBoxes)
{
  vpImageConvert::convert(I, m_img);

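  // Use the full image resolution when no fixed input size has been set.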
  cv::Size inputSize(m_inputSize.width > 0 ? m_inputSize.width : m_img.cols,
                     m_inputSize.height > 0 ? m_inputSize.height : m_img.rows);
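  // blobFromImage() resizes the image to inputSize, subtracts m_mean,
  // multiplies by m_scaleFactor and optionally swaps the R and B channels
  // (cropping disabled).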
  cv::dnn::blobFromImage(m_img, m_blob, m_scaleFactor, inputSize, m_mean, m_swapRB, false);

  m_net.setInput(m_blob);
  m_net.forward(m_outs, m_outNames);

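  // Decode the raw outputs into m_boxes / m_classIds / m_confidences and
  // apply non-maximum suppression (fills m_boxesNMS and m_indices).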
  postProcess();

  boundingBoxes.resize(m_boxesNMS.size());
  for (size_t i = 0; i < m_boxesNMS.size(); i++) {
    cv::Rect box = m_boxesNMS[i];
    boundingBoxes[i] = vpRect(box.x, box.y, box.width, box.height);
  }

  m_nb_objects = boundingBoxes.size();
  m_polygon.resize(boundingBoxes.size());
  m_message.resize(boundingBoxes.size());
  for (size_t i = 0; i < boundingBoxes.size(); i++) {
    std::vector<vpImagePoint> polygon;

    double x = boundingBoxes[i].getLeft();
    double y = boundingBoxes[i].getTop();
    double w = boundingBoxes[i].getWidth();
    double h = boundingBoxes[i].getHeight();

    polygon.push_back(vpImagePoint(y, x));
    polygon.push_back(vpImagePoint(y + h, x));
    polygon.push_back(vpImagePoint(y + h, x + w));
    polygon.push_back(vpImagePoint(y, x + w));

    m_polygon[i] = polygon;

    std::ostringstream oss;
    int idx = m_indices[i]; // index into the pre-NMS vectors for this kept detection
    oss << m_classIds[idx] << " ; " << m_confidences[idx] << " ; " << m_boxes[idx];
    m_message[i] = oss.str();
  }

  return !boundingBoxes.empty();
}
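
/*
 * Usage sketch: a minimal detection loop with this class. The model file name
 * below is hypothetical; any network readable by cv::dnn::readNet() works.
 *
 *   vpDetectorDNN dnn;
 *   dnn.readNet("ssd-mobilenet.onnx");
 *   dnn.setInputSize(300, 300);
 *   dnn.setMean(127.5, 127.5, 127.5);
 *   dnn.setScaleFactor(2.0 / 255.0);
 *   dnn.setConfidenceThreshold(0.5f);
 *   dnn.setNMSThreshold(0.4f);
 *
 *   vpImage<vpRGBa> I; // acquired from a camera or read from disk
 *   std::vector<vpRect> bbs;
 *   if (dnn.detect(I, bbs)) {
 *     std::vector<int> ids = dnn.getDetectionClassIds();
 *     std::vector<float> scores = dnn.getDetectionConfidence();
 *     // bbs, ids and scores describe the detections kept after NMS.
 *   }
 */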

std::vector<vpRect> vpDetectorDNN::getDetectionBBs(bool afterNMS) const
{
  std::vector<vpRect> bbs;
  if (afterNMS) {
    bbs.reserve(m_boxesNMS.size());
    for (size_t i = 0; i < m_boxesNMS.size(); i++) {
      cv::Rect box = m_boxesNMS[i];
      bbs.push_back(vpRect(box.x, box.y, box.width, box.height));
    }
  } else {
    bbs.reserve(m_boxes.size());
    for (size_t i = 0; i < m_boxes.size(); i++) {
      cv::Rect box = m_boxes[i];
      bbs.push_back(vpRect(box.x, box.y, box.width, box.height));
    }
  }

  return bbs;
}

std::vector<int> vpDetectorDNN::getDetectionClassIds(bool afterNMS) const
{
  if (afterNMS) {
    std::vector<int> classIds;
    for (size_t i = 0; i < m_indices.size(); i++) {
      int idx = m_indices[i];
      classIds.push_back(m_classIds[idx]);
    }
    return classIds;
  }

  return m_classIds;
}

std::vector<float> vpDetectorDNN::getDetectionConfidence(bool afterNMS) const
{
  if (afterNMS) {
    std::vector<float> confidences;
    for (size_t i = 0; i < m_indices.size(); i++) {
      int idx = m_indices[i];
      confidences.push_back(m_confidences[idx]);
    }
    return confidences;
  }

  return m_confidences;
}

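// getUnconnectedOutLayersNames() is not available in OpenCV 3.4.3, so for
// that version the output layer names are resolved manually.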
#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
std::vector<cv::String> vpDetectorDNN::getOutputsNames()
{
  static std::vector<cv::String> names;
  if (names.empty()) {
    std::vector<int> outLayers = m_net.getUnconnectedOutLayers();
    std::vector<cv::String> layersNames = m_net.getLayerNames();
    names.resize(outLayers.size());
    for (size_t i = 0; i < outLayers.size(); ++i)
      names[i] = layersNames[outLayers[i] - 1];
  }
  return names;
}
#endif

void vpDetectorDNN::postProcess()
{
  // Direct copy from object_detection.cpp OpenCV sample
  static std::vector<int> outLayers = m_net.getUnconnectedOutLayers();
  static std::string outLayerType = m_net.getLayer(outLayers[0])->type;

  m_classIds.clear();
  m_confidences.clear();
  m_boxes.clear();
  if (m_net.getLayer(0)->outputNameToIndex("im_info") != -1) // Faster-RCNN or R-FCN
  {
    // Network produces an output blob with shape 1x1xNx7, where N is the
    // number of detections and every detection is a vector of values
    // [batchId, classId, confidence, left, top, right, bottom]
    CV_Assert(m_outs.size() == 1);
    float *data = (float *)m_outs[0].data;
    for (size_t i = 0; i < m_outs[0].total(); i += 7) {
      float confidence = data[i + 2];
      if (confidence > m_confidenceThreshold) {
        int left = (int)data[i + 3];
        int top = (int)data[i + 4];
        int right = (int)data[i + 5];
        int bottom = (int)data[i + 6];
        int width = right - left + 1;
        int height = bottom - top + 1;
        m_classIds.push_back((int)(data[i + 1]) - 1); // Skip 0th background class id.
        m_boxes.push_back(cv::Rect(left, top, width, height));
        m_confidences.push_back(confidence);
      }
    }
  } else if (outLayerType == "DetectionOutput") {
    // Network produces an output blob with shape 1x1xNx7, where N is the
    // number of detections and every detection is a vector of values
    // [batchId, classId, confidence, left, top, right, bottom]; here the
    // coordinates are normalized to [0, 1] and scaled to the image size below.
    CV_Assert(m_outs.size() == 1);
    float *data = (float *)m_outs[0].data;
    for (size_t i = 0; i < m_outs[0].total(); i += 7) {
      float confidence = data[i + 2];
      if (confidence > m_confidenceThreshold) {
        int left = (int)(data[i + 3] * m_img.cols);
        int top = (int)(data[i + 4] * m_img.rows);
        int right = (int)(data[i + 5] * m_img.cols);
        int bottom = (int)(data[i + 6] * m_img.rows);
        int width = right - left + 1;
        int height = bottom - top + 1;
        m_classIds.push_back((int)(data[i + 1]) - 1); // Skip 0th background class id.
        m_boxes.push_back(cv::Rect(left, top, width, height));
        m_confidences.push_back(confidence);
      }
    }
  } else if (outLayerType == "Region") {
    for (size_t i = 0; i < m_outs.size(); ++i) {
      // Network produces an output blob with shape NxC, where N is the number
      // of candidate boxes and C is the number of classes + 5: each row is
      // [center_x, center_y, width, height, objectness, class scores...],
      // with the box coordinates normalized to [0, 1].
      float *data = (float *)m_outs[i].data;
      for (int j = 0; j < m_outs[i].rows; ++j, data += m_outs[i].cols) {
        cv::Mat scores = m_outs[i].row(j).colRange(5, m_outs[i].cols);
        cv::Point classIdPoint;
        double confidence;
        cv::minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
        if (confidence > m_confidenceThreshold) {
          int centerX = (int)(data[0] * m_img.cols);
          int centerY = (int)(data[1] * m_img.rows);
          int width = (int)(data[2] * m_img.cols);
          int height = (int)(data[3] * m_img.rows);
          int left = centerX - width / 2;
          int top = centerY - height / 2;

          m_classIds.push_back(classIdPoint.x);
          m_confidences.push_back((float)confidence);
          m_boxes.push_back(cv::Rect(left, top, width, height));
        }
      }
    }
  } else if (outLayerType == "Identity" || outLayerType == "Softmax") {
    // In OpenCV 4.5.2, the output of ssd-mobilenet.onnx is parsed as `Softmax`, whereas
    // in OpenCV 4.5.5, the output is of type `Identity` and the output order is permuted.

    // Network produces 2 output blobs:
    // - `scores` with dimensions 1xNxC
    // - `boxes` with dimensions 1xNx4
    // where `N` is the number of detections and `C` is the number of classes (with `BACKGROUND` as classId = 0).

    int scores_index = m_outNames[0] == "scores" ? 0 : 1; // scores output index.
    int boxes_index = m_outNames[0] == "boxes" ? 0 : 1;   // boxes output index.

    int N = m_outs[scores_index].size[1], C = m_outs[scores_index].size[2];

    float *confidence = (float *)m_outs[scores_index].data;
    float *bbox = (float *)m_outs[boxes_index].data;

    // Loop over all guesses on the output of the network.
    for (int i = 0; i < N; i++) {
      uint32_t maxClass = 0;
      float maxScore = -1000.0f;

      for (int j = 1; j < C; j++) // ignore background (classId = 0).
      {
        const float score = confidence[i * C + j];

        if (score < m_confidenceThreshold)
          continue;

        if (score > maxScore) {
          maxScore = score;
          maxClass = j;
        }
      }

      if (maxScore > m_confidenceThreshold) {
        int left = (int)(bbox[4 * i] * m_img.cols);
        int top = (int)(bbox[4 * i + 1] * m_img.rows);
        int right = (int)(bbox[4 * i + 2] * m_img.cols);
        int bottom = (int)(bbox[4 * i + 3] * m_img.rows);
        int width = right - left + 1;
        int height = bottom - top + 1;

        m_boxes.push_back(cv::Rect(left, top, width, height));
        m_classIds.push_back(maxClass);
        m_confidences.push_back(maxScore);
      }
    }
  } else
    CV_Error(cv::Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);

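  // Non-maximum suppression: keep the indices of the highest-confidence boxes
  // and discard boxes that overlap a kept box by more than m_nmsThreshold.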
  cv::dnn::NMSBoxes(m_boxes, m_confidences, m_confidenceThreshold, m_nmsThreshold, m_indices);
  m_boxesNMS.resize(m_indices.size());
  for (size_t i = 0; i < m_indices.size(); ++i) {
    int idx = m_indices[i];
    m_boxesNMS[i] = m_boxes[idx];
  }
}

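// When `config` and `framework` are left empty, cv::dnn::readNet() infers the
// model format from the file extension (e.g. .caffemodel, .pb, .onnx, .weights).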
void vpDetectorDNN::readNet(const std::string &model, const std::string &config, const std::string &framework)
{
  m_net = cv::dnn::readNet(model, config, framework);
#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
  m_outNames = getOutputsNames();
#else
  m_outNames = m_net.getUnconnectedOutLayersNames();
#endif
}

void vpDetectorDNN::setConfidenceThreshold(float confThreshold) { m_confidenceThreshold = confThreshold; }

void vpDetectorDNN::setInputSize(int width, int height)
{
  m_inputSize.width = width;
  m_inputSize.height = height;
}

void vpDetectorDNN::setMean(double meanR, double meanG, double meanB) { m_mean = cv::Scalar(meanR, meanG, meanB); }

void vpDetectorDNN::setNMSThreshold(float nmsThreshold) { m_nmsThreshold = nmsThreshold; }

void vpDetectorDNN::setPreferableBackend(int backendId) { m_net.setPreferableBackend(backendId); }

void vpDetectorDNN::setPreferableTarget(int targetId) { m_net.setPreferableTarget(targetId); }

void vpDetectorDNN::setScaleFactor(double scaleFactor) { m_scaleFactor = scaleFactor; }
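// Note: with the default mean (127.5, 127.5, 127.5) and scale factor 2/255,
// blobFromImage() maps a pixel value p in [0, 255] to (p - 127.5) * 2/255,
// i.e. into [-1, 1].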

void vpDetectorDNN::setSwapRB(bool swapRB) { m_swapRB = swapRB; }

#elif !defined(VISP_BUILD_SHARED_LIBS)
// Work around to avoid warning: libvisp_core.a(vpDetectorDNN.cpp.o) has no
// symbols
void dummy_vpDetectorDNN(){};
#endif