Pose estimation for augmented reality
Pose from Direct Linear Transform method

Introduction

Although PnP is intrinsically a non-linear problem, a simple solution known as Direct Linear Transform (DLT) [4] [7], based on the resolution of a linear system, can be considered.
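
As a brief reminder of where this linear system comes from (the complete derivation, with the equation numbering referred to in the code comments below, belongs to the full tutorial): each 3D point ${^w}{\bf X} = (X,Y,Z,1)^T$ and its normalized image coordinates ${\bf x} = (x,y,1)^T$ are related, up to a scale factor, by ${\bf x} \propto [{^c}{\bf R}_w \;\; {^c}{\bf t}_w] \, {^w}{\bf X}$. Denoting ${\bf h}_1^T$, ${\bf h}_2^T$ and ${\bf h}_3^T$ the three rows of this $3 \times 4$ matrix, eliminating the unknown scale gives two linear equations per point:

${\bf h}_1^T \, {^w}{\bf X} - x \, {\bf h}_3^T \, {^w}{\bf X} = 0 \quad \textrm{and} \quad {\bf h}_2^T \, {^w}{\bf X} - y \, {\bf h}_3^T \, {^w}{\bf X} = 0$

Stacking these equations for all the points leads to a homogeneous system ${\bf A} {\bf h} = 0$, where ${\bf A}$ is a $2n \times 12$ matrix ($n$ being the number of points) and ${\bf h} = ({\bf h}_1^T, {\bf h}_2^T, {\bf h}_3^T)^T$ gathers the 12 unknowns. This is the system built and solved in the source code below.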

Source code

The following source code, which uses OpenCV, is also available in the pose-dlt-opencv.cpp file. It computes the pose of the camera from 3D-2D point correspondences.

#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/calib3d/calib3d.hpp>
void pose_dlt(const std::vector< cv::Point3d > &wX, const std::vector< cv::Point2d > &x, cv::Mat &ctw, cv::Mat &cRw)
{
  int npoints = (int)wX.size();
  cv::Mat A(2*npoints, 12, CV_64F, cv::Scalar(0));
  for (int i = 0; i < npoints; i++) { // Update matrix A using eq. 5
    A.at<double>(2*i, 0) = wX[i].x;
    A.at<double>(2*i, 1) = wX[i].y;
    A.at<double>(2*i, 2) = wX[i].z;
    A.at<double>(2*i, 3) = 1;
    A.at<double>(2*i+1, 4) = wX[i].x;
    A.at<double>(2*i+1, 5) = wX[i].y;
    A.at<double>(2*i+1, 6) = wX[i].z;
    A.at<double>(2*i+1, 7) = 1;
    A.at<double>(2*i, 8) = -x[i].x * wX[i].x;
    A.at<double>(2*i, 9) = -x[i].x * wX[i].y;
    A.at<double>(2*i, 10) = -x[i].x * wX[i].z;
    A.at<double>(2*i, 11) = -x[i].x;
    A.at<double>(2*i+1, 8) = -x[i].y * wX[i].x;
    A.at<double>(2*i+1, 9) = -x[i].y * wX[i].y;
    A.at<double>(2*i+1, 10) = -x[i].y * wX[i].z;
    A.at<double>(2*i+1, 11) = -x[i].y;
  }
  cv::Mat w, u, vt;
  cv::SVD::compute(A, w, u, vt);
  double smallestSv = w.at<double>(0, 0);
  unsigned int indexSmallestSv = 0;
  for (int i = 1; i < w.rows; i++) {
    if (w.at<double>(i, 0) < smallestSv) {
      smallestSv = w.at<double>(i, 0);
      indexSmallestSv = i;
    }
  }
  cv::Mat h = vt.row(indexSmallestSv);
  if (h.at<double>(0, 11) < 0) // tz < 0
    h *= -1;
  // Normalization to ensure that ||r3|| = 1
  double norm = sqrt( h.at<double>(0,8)*h.at<double>(0,8)
                    + h.at<double>(0,9)*h.at<double>(0,9)
                    + h.at<double>(0,10)*h.at<double>(0,10) );
  h /= norm;
  for (int i = 0; i < 3; i++) {
    ctw.at<double>(i,0) = h.at<double>(0, 4*i+3); // Translation
    for (int j = 0; j < 3; j++)
      cRw.at<double>(i,j) = h.at<double>(0, 4*i+j); // Rotation
  }
}
int main()
{
  std::vector< cv::Point3d > wX;
  std::vector< cv::Point2d > x;
  // Ground truth pose used to generate the data
  cv::Mat ctw_truth = (cv::Mat_<double>(3,1) << -0.1, 0.1, 1.2); // Translation vector
  cv::Mat crw_truth = (cv::Mat_<double>(3,1) << CV_PI/180*(5), CV_PI/180*(0), CV_PI/180*(45)); // Rotation vector
  cv::Mat cRw_truth(3,3,cv::DataType<double>::type); // Rotation matrix
  cv::Rodrigues(crw_truth, cRw_truth);
  // Input data: 3D coordinates of at least 6 non coplanar points
  double L = 0.2;
  wX.push_back( cv::Point3d(  -L, -L, 0  ) ); // wX_0 ( -L, -L, 0  )^T
  wX.push_back( cv::Point3d( 2*L, -L, 0.2) ); // wX_1 ( 2L, -L, 0.2)^T
  wX.push_back( cv::Point3d(   L,  L, 0.2) ); // wX_2 (  L,  L, 0.2)^T
  wX.push_back( cv::Point3d(  -L,  L, 0  ) ); // wX_3 ( -L,  L, 0  )^T
  wX.push_back( cv::Point3d(-2*L,  L, 0  ) ); // wX_4 (-2L,  L, 0  )^T
  wX.push_back( cv::Point3d(   0,  0, 0.5) ); // wX_5 (  0,  0, 0.5)^T
  // Input data: 2D coordinates of the points on the image plane
  for (int i = 0; i < (int)wX.size(); i++) {
    cv::Mat cX = cRw_truth*cv::Mat(wX[i]) + ctw_truth; // Update cX, cY, cZ
    x.push_back( cv::Point2d( cX.at<double>(0, 0)/cX.at<double>(2, 0),
                              cX.at<double>(1, 0)/cX.at<double>(2, 0) ) ); // x = (cX/cZ, cY/cZ)
  }
  cv::Mat ctw(3, 1, CV_64F); // Translation vector
  cv::Mat cRw(3, 3, CV_64F); // Rotation matrix
  pose_dlt(wX, x, ctw, cRw);
  std::cout << "ctw (ground truth):\n" << ctw_truth << std::endl;
  std::cout << "ctw (computed with DLT):\n" << ctw << std::endl;
  std::cout << "cRw (ground truth):\n" << cRw_truth << std::endl;
  std::cout << "cRw (computed with DLT):\n" << cRw << std::endl;
  return 0;
}
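
To try it out, the file can typically be compiled and linked against OpenCV with a command such as g++ pose-dlt-opencv.cpp -o pose-dlt-opencv $(pkg-config --cflags --libs opencv4), where the exact pkg-config module name depends on your OpenCV installation.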

Source code explained

First of all we include the headers of the files that are required to manipulate OpenCV vectors and matrices.

#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/calib3d/calib3d.hpp>

Then we introduce the function that does the pose estimation. It takes as input ${^w}{\bf X} = (X,Y,Z,1)^T$, the 3D coordinates of the points in the world frame, and ${\bf x} = (x,y,1)^T$, their normalized coordinates in the image plane. It returns the estimated pose as the translation vector ctw and the rotation matrix cRw.

void pose_dlt(const std::vector< cv::Point3d > &wX, const std::vector< cv::Point2d > &x, cv::Mat &ctw, cv::Mat &cRw)

The implementation of the Direct Linear Transform algorithm comes next. First, for each point we update the values of matrix A using equation (5). Then we solve the system Ah = 0 using a Singular Value Decomposition of A. Finally, we determine the smallest singular value in order to identify the corresponding singular vector (the matching row of vt), which is the solution.

int npoints = (int)wX.size();
cv::Mat A(2*npoints, 12, CV_64F, cv::Scalar(0));
for (int i = 0; i < npoints; i++) { // Update matrix A using eq. 5
  A.at<double>(2*i, 0) = wX[i].x;
  A.at<double>(2*i, 1) = wX[i].y;
  A.at<double>(2*i, 2) = wX[i].z;
  A.at<double>(2*i, 3) = 1;
  A.at<double>(2*i+1, 4) = wX[i].x;
  A.at<double>(2*i+1, 5) = wX[i].y;
  A.at<double>(2*i+1, 6) = wX[i].z;
  A.at<double>(2*i+1, 7) = 1;
  A.at<double>(2*i, 8) = -x[i].x * wX[i].x;
  A.at<double>(2*i, 9) = -x[i].x * wX[i].y;
  A.at<double>(2*i, 10) = -x[i].x * wX[i].z;
  A.at<double>(2*i, 11) = -x[i].x;
  A.at<double>(2*i+1, 8) = -x[i].y * wX[i].x;
  A.at<double>(2*i+1, 9) = -x[i].y * wX[i].y;
  A.at<double>(2*i+1, 10) = -x[i].y * wX[i].z;
  A.at<double>(2*i+1, 11) = -x[i].y;
}
cv::Mat w, u, vt;
cv::SVD::compute(A, w, u, vt);
double smallestSv = w.at<double>(0, 0);
unsigned int indexSmallestSv = 0;
for (int i = 1; i < w.rows; i++) {
  if (w.at<double>(i, 0) < smallestSv) {
    smallestSv = w.at<double>(i, 0);
    indexSmallestSv = i;
  }
}
cv::Mat h = vt.row(indexSmallestSv);
if (h.at<double>(0, 11) < 0) // tz < 0
  h *= -1;
// Normalization to ensure that ||r3|| = 1
double norm = sqrt( h.at<double>(0,8)*h.at<double>(0,8)
                  + h.at<double>(0,9)*h.at<double>(0,9)
                  + h.at<double>(0,10)*h.at<double>(0,10) );
h /= norm;

From now on, the resulting singular vector h, which corresponds to the smallest singular value of matrix A, is used to update the translation vector ctw and the rotation matrix cRw.

for (int i = 0; i < 3; i++) {
  ctw.at<double>(i,0) = h.at<double>(0, 4*i+3); // Translation
  for (int j = 0; j < 3; j++)
    cRw.at<double>(i,j) = h.at<double>(0, 4*i+j); // Rotation
}
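
Note that, since h is obtained from a purely linear estimation, the 3x3 block copied into cRw is only guaranteed to be close to a rotation matrix; with noise-free data, as in the simulation below, it is essentially exact, but with noisy measurements it may not be perfectly orthogonal. The following helper, which is not part of the tutorial code, sketches how the estimate could be projected onto the closest rotation matrix using an SVD:

// Optional helper (not part of the tutorial): project an approximately
// orthogonal 3x3 matrix onto the closest rotation matrix using an SVD.
// It assumes cRw is a 3x3 CV_64F matrix, as filled by pose_dlt().
void orthogonalize_rotation(cv::Mat &cRw)
{
  cv::Mat w, u, vt;
  cv::SVD::compute(cRw, w, u, vt);
  cv::Mat R = u * vt;             // Closest orthogonal matrix to cRw
  if (cv::determinant(R) < 0) {   // Ensure det(R) = +1 (proper rotation)
    cv::Mat d = cv::Mat::eye(3, 3, CV_64F);
    d.at<double>(2, 2) = -1;
    R = u * d * vt;
  }
  cRw = R;
}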

Finally we define the main function, in which we will initialize the input data before calling the previous function to compute the pose using the DLT algorithm.

int main()

First, in the main function we create the data structures that will contain the 3D coordinates of the points wX in the world frame and their coordinates x in the image plane obtained after perspective projection. Note here that at least 6 non-coplanar points are required to estimate the pose.

std::vector< cv::Point3d > wX;
std::vector< cv::Point2d > x;

For our simulation we then initialize the input data from a ground truth pose, with the translation in ctw_truth and the rotation matrix in cRw_truth. For each point, we store in wX[i] its 3D coordinates (wX, wY, wZ, 1) in the world frame and compute in cX its 3D coordinates (cX, cY, cZ, 1) in the camera frame. Then we store in x[i] its coordinates (x, y) in the image plane, obtained by perspective projection.

// Ground truth pose used to generate the data
cv::Mat ctw_truth = (cv::Mat_<double>(3,1) << -0.1, 0.1, 1.2); // Translation vector
cv::Mat crw_truth = (cv::Mat_<double>(3,1) << CV_PI/180*(5), CV_PI/180*(0), CV_PI/180*(45)); // Rotation vector
cv::Mat cRw_truth(3,3,cv::DataType<double>::type); // Rotation matrix
cv::Rodrigues(crw_truth, cRw_truth);
// Input data: 3D coordinates of at least 6 non coplanar points
double L = 0.2;
wX.push_back( cv::Point3d(  -L, -L, 0  ) ); // wX_0 ( -L, -L, 0  )^T
wX.push_back( cv::Point3d( 2*L, -L, 0.2) ); // wX_1 ( 2L, -L, 0.2)^T
wX.push_back( cv::Point3d(   L,  L, 0.2) ); // wX_2 (  L,  L, 0.2)^T
wX.push_back( cv::Point3d(  -L,  L, 0  ) ); // wX_3 ( -L,  L, 0  )^T
wX.push_back( cv::Point3d(-2*L,  L, 0  ) ); // wX_4 (-2L,  L, 0  )^T
wX.push_back( cv::Point3d(   0,  0, 0.5) ); // wX_5 (  0,  0, 0.5)^T
// Input data: 2D coordinates of the points on the image plane
for (int i = 0; i < (int)wX.size(); i++) {
  cv::Mat cX = cRw_truth*cv::Mat(wX[i]) + ctw_truth; // Update cX, cY, cZ
  x.push_back( cv::Point2d( cX.at<double>(0, 0)/cX.at<double>(2, 0),
                            cX.at<double>(1, 0)/cX.at<double>(2, 0) ) ); // x = (cX/cZ, cY/cZ)
}

From here we have initialized ${^w}{\bf X} = (X,Y,Z,1)^T$ and ${\bf x} = (x,y,1)^T$. We are now ready to call the function that does the pose estimation.

cv::Mat ctw(3, 1, CV_64F); // Translation vector
cv::Mat cRw(3, 3, CV_64F); // Rotation matrix
pose_dlt(wX, x, ctw, cRw);
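
As an optional sanity check, which is not part of the tutorial code, the same correspondences can also be passed to OpenCV's generic PnP solver. Since x already contains normalized coordinates, the camera matrix is simply the identity and there is no distortion; a minimal sketch:

// Optional cross-check with OpenCV's built-in solver (not in the tutorial):
// with normalized image coordinates, the camera matrix is the identity.
cv::Mat rvec, tvec;
cv::solvePnP(wX, x, cv::Mat::eye(3, 3, CV_64F), cv::Mat(), rvec, tvec);
cv::Mat cRw_pnp;
cv::Rodrigues(rvec, cRw_pnp); // Convert the rotation vector to a matrix
std::cout << "ctw (computed with solvePnP):\n" << tvec << std::endl;
std::cout << "cRw (computed with solvePnP):\n" << cRw_pnp << std::endl;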

Resulting pose estimation

If you run the previous code, it will produce the following result, which shows that the estimated pose is equal to the ground truth used to generate the input data:

ctw (ground truth):
[-0.1;
0.1;
1.2]
ctw (computed with DLT):
[-0.09999999999999899;
0.09999999999999894;
1.199999999999986]
cRw (ground truth):
[0.7072945483755065, -0.7061704379962989, 0.03252282795827704;
0.7061704379962989, 0.7036809008245869, -0.07846338199958876;
0.03252282795827704, 0.07846338199958876, 0.9963863524490802]
cRw (computed with DLT):
[0.7072945483754999, -0.7061704379962919, 0.03252282795827634;
0.7061704379962918, 0.70368090082458, -0.07846338199958748;
0.03252282795827235, 0.07846338199958545, 0.9963863524490808]
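
Beyond this visual comparison, a simple way to quantify the result is to reproject the 3D points with the estimated pose and measure the residual in the normalized image plane. The helper below is not part of the tutorial; it is a minimal sketch that reuses the same data structures as the listing above:

// Hypothetical helper (not part of the tutorial): mean reprojection error,
// in the normalized image plane, of the pose (ctw, cRw) on the input points.
double mean_reprojection_error(const std::vector< cv::Point3d > &wX,
                               const std::vector< cv::Point2d > &x,
                               const cv::Mat &ctw, const cv::Mat &cRw)
{
  double err = 0;
  for (int i = 0; i < (int)wX.size(); i++) {
    cv::Mat cX = cRw*cv::Mat(wX[i]) + ctw; // Point expressed in the camera frame
    double dx = cX.at<double>(0, 0)/cX.at<double>(2, 0) - x[i].x; // Residual along x
    double dy = cX.at<double>(1, 0)/cX.at<double>(2, 0) - x[i].y; // Residual along y
    err += sqrt(dx*dx + dy*dy);
  }
  return err / wX.size();
}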