From ce7a1877895ea494ffb79b2e85e48ec6747954f9 Mon Sep 17 00:00:00 2001 From: Michael Welter Date: Thu, 9 May 2024 22:17:37 +0200 Subject: tracker/nn: Add support for 3x3 pos/size cov matrix (and some cleanup) (#1845) Adds support for 3x3 covariance matrices for position+size estimation. My recent (unreleased) models output those instead of only component wise standard deviations. Overall Accept 3x3 covariance matrix output for position+size Old models still worked fine in my testing Delete commented code which was there previously to support recurrent models. (Won't be used in foreseeable future) Delete leftover code for experiments with closed eyes detection (Likewise, I have given up on that / Is not really needed any more) Improve comments and code quality in the internal filter code --- tracker-neuralnet/deadzone_filter.cpp | 57 +++++++++------- tracker-neuralnet/model_adapters.cpp | 122 ++++++++++++---------------------- tracker-neuralnet/model_adapters.h | 10 ++- 3 files changed, 79 insertions(+), 110 deletions(-) diff --git a/tracker-neuralnet/deadzone_filter.cpp b/tracker-neuralnet/deadzone_filter.cpp index b41afdba..fa96eeb3 100644 --- a/tracker-neuralnet/deadzone_filter.cpp +++ b/tracker-neuralnet/deadzone_filter.cpp @@ -3,6 +3,7 @@ #include "opencv_contrib.h" #include "unscented_trafo.h" +#include #include #include #include @@ -12,19 +13,25 @@ namespace neuralnet_tracker_ns using namespace cvcontrib; -using StateVec = cv::Vec; -using StateCov = cv::Matx; +// Number of degrees of freedom of position and rotation +static constexpr int dofs = 6; -static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints<6>::num_sigmas; +using StateVec = cv::Vec; +using StateCov = cv::Matx; + +static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints::num_sigmas; +// Rescaling factor for position/size living in the space of the face crop. +// Applied prior to application of UKF to prevent numerical problems. static constexpr float img_scale = 200.f; +// Similar rescaling factor for position/size that live in world space. static constexpr float world_scale = 1000.f; // mm +// Fills the 6 DoF covariance factor, as in L L^T factorization. +// Covariance is given wrt the tangent space of current predictions StateCov make_tangent_space_uncertainty_tril(const PoseEstimator::Face &face) { StateCov tril = StateCov::eye(); - tril(0,0) = face.center_stddev.x/img_scale; - tril(1,1) = face.center_stddev.y/img_scale; - tril(2,2) = face.size_stddev/img_scale; + set_minor<3,3>(tril, 0, 0, face.center_size_cov_tril / img_scale); set_minor<3,3>(tril, 3, 3, face.rotaxis_cov_tril); return tril; } @@ -41,7 +48,7 @@ QuatPose apply_offset(const QuatPose& pose, const StateVec& offset) } -PoseEstimator::Face apply_offset(const PoseEstimator::Face& face, const StateVec& offset) +std::tuple apply_offset(const PoseEstimator::Face& face, const StateVec& offset) { const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] }); const auto r = face.rotation * dr; @@ -57,14 +64,10 @@ PoseEstimator::Face apply_offset(const PoseEstimator::Face& face, const StateVec // is designed to handle non-linearities like this. const float sz = std::max(0.1f*face.size, face.size + offset[2]*img_scale); - return PoseEstimator::Face{ + return { r, - {}, - {}, p, - {}, sz, - {} }; } @@ -77,9 +80,9 @@ StateVec relative_to(const QuatPose& reference, const QuatPose& pose) } -ukf_cv::SigmaPoints<6> relative_to(const QuatPose& pose, const std::array& sigmas) +ukf_cv::SigmaPoints relative_to(const QuatPose& pose, const std::array& sigmas) { - ukf_cv::SigmaPoints<6> out; + ukf_cv::SigmaPoints out; // Beware, the number of points is != the number of DoFs. std::transform(sigmas.begin(), sigmas.end(), out.begin(), [&pose](const QuatPose& s) { return relative_to(pose, s); }); @@ -87,14 +90,14 @@ ukf_cv::SigmaPoints<6> relative_to(const QuatPose& pose, const std::array compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints<6>& sigmas, Face2WorldFunction face2world) +std::array compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints& sigmas, Face2WorldFunction face2world) { std::array out; std::transform(sigmas.begin(), sigmas.end(), out.begin(), [face2world=std::move(face2world), &face](const StateVec& sigma_point) { // First unpack the state vector and generate quaternion rotation w.r.t image space. - const auto sigma_face = apply_offset(face, sigma_point); + const auto [rotation, center, size] = apply_offset(face, sigma_point); // Then transform ... - QuatPose pose = face2world(sigma_face.rotation, sigma_face.center, sigma_face.size); + QuatPose pose = face2world(rotation, center, size); pose.pos /= world_scale; return pose; }); @@ -131,24 +134,32 @@ StateVec apply_filter_to_offset(const StateVec& offset, const StateCov& offset_c QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose_, float dt, Face2WorldFunction face2world, const FiltParams& params) { - ukf_cv::MerweScaledSigmaPoints<6> unscentedtrafo; + ukf_cv::MerweScaledSigmaPoints unscentedtrafo; auto previous_pose = previous_pose_; previous_pose.pos /= world_scale; - // Here we have the covariance matrix for the offset from the observed values in `face`. + // Get 6 DoF covariance factor for the predictions in the face crop space. const auto cov_tril = make_tangent_space_uncertainty_tril(face); - // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose. - const ukf_cv::SigmaPoints<6> sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true); + // Compute so called sigma points. These represent the distribution from the covariance matrix in terms of + // sampling points. + const ukf_cv::SigmaPoints sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true); + // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose. + // The trick is to transform the sampling points and compute a covariance from them in the output space. // We have many of these sigma points. This is why that callback comes into play here. + // The transform to 3d world space is more than Face2WorldFunction because we also need to apply the sigma point (as + // a relative offset) to the pose in face crop space. const std::array pose_sigmas = compute_world_pose_from_sigma_point(face, sigmas, std::move(face2world)); - const ukf_cv::SigmaPoints<6> deltas_sigmas = relative_to(previous_pose, pose_sigmas); + // Compute sigma points relative to the previous pose + const ukf_cv::SigmaPoints deltas_sigmas = relative_to(previous_pose, pose_sigmas); + // Compute the mean offset from the last pose and the spread due to the networks uncertainty output. const auto [offset, offset_cov] = unscentedtrafo.compute_statistics(deltas_sigmas); - // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive at the final output. + // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive + // at the final output. const StateVec scaled_offset = apply_filter_to_offset(offset, offset_cov, dt, params); QuatPose new_pose = apply_offset(previous_pose, scaled_offset); diff --git a/tracker-neuralnet/model_adapters.cpp b/tracker-neuralnet/model_adapters.cpp index a8580a89..f53478af 100644 --- a/tracker-neuralnet/model_adapters.cpp +++ b/tracker-neuralnet/model_adapters.cpp @@ -192,11 +192,15 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses if (session_.GetOutputCount() < 2) throw std::runtime_error("Invalid Model: must have at least two outputs"); - // WARNING UB .. but still ... - // If the model was saved without meta data, it seems the version field is uninitialized. - // In that case reading from it is UB. However, in practice we will just get some arbitrary number - // which is hopefully different from the numbers used by models where the version is set. - if (model_version_ != 2 && model_version_ != 3) + // WARNING: Messy model compatibility issues! + // When reading the initial model release, it did not have the version field set. + // Reading it here will result in some unspecified value. It's probably UB due to + // reading uninitialized memory. But there is little choice. + // Now, detection of this old version is messy ... we have to guess based on the + // number we get. Getting an uninitialized value matching a valid version is unlikely. + // But the real problem is that this line must be updated whenever we want to bump the + // version number!! + if (model_version_ <= 0 || model_version_ > 4) model_version_ = 1; const cv::Size input_image_shape = get_input_image_shape(session_); @@ -224,10 +228,9 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses { "box", TensorSpec{ { 1, 4}, &output_box_[0], output_box_.rows } }, { "rotaxis_scales_tril", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, { "rotaxis_std", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, // TODO: Delete when old models aren't used any more - { "eyes", TensorSpec{ { 1, 2}, output_eyes_.val, output_eyes_.rows }}, - { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}}, - { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}}, - //{ "box_std", TensorSpec{ {1, 4}, output_box_scales_.val, output_box_scales_ .rows}} + { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}}, + { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}}, + { "pos_size_scales_tril", TensorSpec{ {1, 3, 3}, output_coord_scales_tril_.val, 9}} }; qDebug() << "Pose model inputs (" << session_.GetInputCount() << ")"; @@ -236,17 +239,17 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses output_c_names_.resize(session_.GetOutputCount()); for (size_t i=0; isecond; + TensorSpec& t = my_tensor_spec_it->second; if (onnx_tensor_spec.GetShape() != t.shape || onnx_tensor_spec.GetElementType() != Ort::TypeToTensorType::type) throw std::runtime_error("Invalid output tensor spec for "s + name); @@ -266,29 +269,9 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses has_uncertainty_ = understood_outputs.at("rotaxis_scales_tril").available || understood_outputs.at("rotaxis_std").available; has_uncertainty_ &= understood_outputs.at("pos_size_std").available || - understood_outputs.at("pos_size_scales").available; - //has_uncertainty_ &= understood_outputs.at("box_std").available; - has_eye_closed_detection_ = understood_outputs.at("eyes").available; - - // FIXME: Recurrent states - - // size_t num_regular_outputs = 2; - - // num_recurrent_states_ = session_.GetInputCount()-1; - // if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_) - // throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers."); - - // // Create tensors for recurrent state - // for (size_t i = 0; i < num_recurrent_states_; ++i) - // { - // const auto& input_info = session_.GetInputTypeInfo(1+i); - // const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i); - // if (input_info.GetTensorTypeAndShapeInfo().GetShape() != - // output_info.GetTensorTypeAndShapeInfo().GetShape()) - // throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output"); - // input_val_.push_back(create_tensor(input_info, allocator_)); - // output_val_.push_back(create_tensor(output_info, allocator_)); - // } + understood_outputs.at("pos_size_scales").available || + understood_outputs.at("pos_size_scales_tril").available; + pos_scale_uncertainty_is_matrix_ = understood_outputs.at("pos_size_scales_tril").available; input_names_.resize(session_.GetInputCount()); input_c_names_.resize(session_.GetInputCount()); @@ -348,38 +331,32 @@ std::optional PoseEstimator::run( return {}; } - for (size_t i = 0; i PoseEstimator::run( assert(output_rotaxis_scales_tril_(0, 1) == 0); assert(output_rotaxis_scales_tril_(0, 2) == 0); assert(output_rotaxis_scales_tril_(1, 2) == 0); + assert(center_size_cov_tril(0, 1) == 0); + assert(center_size_cov_tril(0, 2) == 0); + assert(center_size_cov_tril(1, 2) == 0); cv::Matx33f rotaxis_scales_tril = output_rotaxis_scales_tril_; @@ -407,29 +387,9 @@ std::optional PoseEstimator::run( 0.5f*patch_size*(output_box_[2]-output_box_[0]), 0.5f*patch_size*(output_box_[3]-output_box_[1]) }; - // const RoiCorners outbox = { - // patch_center + 0.5f*patch_size*cv::Point2f{output_box_[0], output_box_[1]}, - // patch_center + 0.5f*patch_size*cv::Point2f{output_box_[2], output_box_[3]} - // }; - // RoiCorners outbox_stddev = { - // 0.5f*patch_size*cv::Point2f{output_box_scales_[0], output_box_scales_[1]}, - // 0.5f*patch_size*cv::Point2f{output_box_scales_[2], output_box_scales_[3]} - // }; - - // Because the model is sensitive to closing eyes we increase the uncertainty - // a lot to make the subsequent filtering smooth the output more. This should suppress - // "twitching" when the user blinks. - if (has_eye_closed_detection_) - { - const float eye_open = std::min(output_eyes_[0], output_eyes_[1]); - const float increase_factor = 1.f + 10.f * std::pow(1. - eye_open,4.f); - rotaxis_scales_tril *= increase_factor; - size_stddev *= increase_factor; - center_stddev *= increase_factor; - } return std::optional({ - rotation, rotaxis_scales_tril, outbox, center, center_stddev, size, size_stddev + rotation, rotaxis_scales_tril, outbox, center, size, center_size_cov_tril }); } diff --git a/tracker-neuralnet/model_adapters.h b/tracker-neuralnet/model_adapters.h index 48f2fa2c..c1aaa6de 100644 --- a/tracker-neuralnet/model_adapters.h +++ b/tracker-neuralnet/model_adapters.h @@ -50,9 +50,8 @@ class PoseEstimator cv::Matx33f rotaxis_cov_tril; // Lower triangular factor of Cholesky decomposition cv::Rect2f box; cv::Point2f center; - cv::Point2f center_stddev; float size; - float size_stddev; + cv::Matx33f center_size_cov_tril; // Lower triangular factor of Cholesky decomposition }; PoseEstimator(Ort::MemoryInfo &allocator_info, @@ -84,16 +83,15 @@ class PoseEstimator cv::Vec output_quat_{}; // Quaternion output cv::Vec output_box_{}; // Bounding box output cv::Matx33f output_rotaxis_scales_tril_{}; // Lower triangular matrix of LLT factorization of covariance of rotation vector as offset from output quaternion - cv::Vec output_eyes_{}; - cv::Vec output_coord_scales_{}; + cv::Matx33f output_coord_scales_tril_{}; // Lower triangular factor + cv::Vec3f output_coord_scales_std_{}; // Depending on the model, alternatively a 3d vector with standard deviations. std::vector output_val_; // Tensors to put the model outputs in. std::vector output_names_; // Refers to the names in the onnx model. std::vector output_c_names_; // Refers to the C names in the onnx model. // More bookkeeping - size_t num_recurrent_states_ = 0; double last_inference_time_ = 0; bool has_uncertainty_ = false; - bool has_eye_closed_detection_ = false; + bool pos_scale_uncertainty_is_matrix_ = false; }; -- cgit v1.2.3