From ce7a1877895ea494ffb79b2e85e48ec6747954f9 Mon Sep 17 00:00:00 2001
From: Michael Welter <DaWelter@users.noreply.github.com>
Date: Thu, 9 May 2024 22:17:37 +0200
Subject: tracker/nn: Add support for 3x3 pos/size cov matrix (and some
 cleanup) (#1845)

Adds support for 3x3 covariance matrices for position+size estimation.
My recent (unreleased) models output those instead of only component wise standard deviations.

Overall

    Accept 3x3 covariance matrix output for position+size
    Old models still worked fine in my testing
    Delete commented code which was there previously to support recurrent models. (Won't be used in foreseeable future)
    Delete leftover code for experiments with closed eyes detection (Likewise, I have given up on that / Is not really needed any more)
    Improve comments and code quality in the internal filter code
---
 tracker-neuralnet/deadzone_filter.cpp |  57 +++++++++-------
 tracker-neuralnet/model_adapters.cpp  | 122 ++++++++++++----------------------
 tracker-neuralnet/model_adapters.h    |  10 ++-
 3 files changed, 79 insertions(+), 110 deletions(-)
diff --git a/tracker-neuralnet/deadzone_filter.cpp b/tracker-neuralnet/deadzone_filter.cpp
index b41afdba..fa96eeb3 100644
--- a/tracker-neuralnet/deadzone_filter.cpp
+++ b/tracker-neuralnet/deadzone_filter.cpp
@@ -3,6 +3,7 @@
 #include "opencv_contrib.h"
 #include "unscented_trafo.h"
 
+#include <tuple>
 #include <opencv2/core/base.hpp>
 #include <opencv2/core/matx.hpp>
 #include <opencv2/core/quaternion.hpp>
@@ -12,19 +13,25 @@ namespace neuralnet_tracker_ns
 
 using namespace cvcontrib;
 
-using StateVec = cv::Vec<float,6>;
-using StateCov = cv::Matx<float,6,6>;
+// Number of degrees of freedom of position and rotation
+static constexpr int dofs = 6;
 
-static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints<6>::num_sigmas;
+using StateVec = cv::Vec<float,dofs>;
+using StateCov = cv::Matx<float,dofs,dofs>;
+
+static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints<dofs>::num_sigmas;
+// Rescaling factor for position/size living in the space of the face crop.
+// Applied prior to application of UKF to prevent numerical problems.
 static constexpr float img_scale = 200.f;
+// Similar rescaling factor for position/size that live in world space.
 static constexpr float world_scale = 1000.f; // mm
 
+// Fills the 6 DoF covariance factor, as in L L^T factorization.
+// Covariance is given wrt the tangent space of current predictions
 StateCov make_tangent_space_uncertainty_tril(const PoseEstimator::Face &face)
 {
     StateCov tril = StateCov::eye();
-    tril(0,0) = face.center_stddev.x/img_scale;
-    tril(1,1) = face.center_stddev.y/img_scale;
-    tril(2,2) = face.size_stddev/img_scale;
+    set_minor<3,3>(tril, 0, 0, face.center_size_cov_tril / img_scale);
     set_minor<3,3>(tril, 3, 3, face.rotaxis_cov_tril);
     return tril;
 }
@@ -41,7 +48,7 @@ QuatPose apply_offset(const QuatPose& pose, const StateVec& offset)
 }
 
 
-PoseEstimator::Face apply_offset(const PoseEstimator::Face& face, const StateVec& offset)
+std::tuple<cv::Quatf, cv::Point2f, float> apply_offset(const PoseEstimator::Face& face, const StateVec& offset)
 {
     const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] });
     const auto r = face.rotation * dr;
@@ -57,14 +64,10 @@ PoseEstimator::Face apply_offset(const PoseEstimator::Face& face, const StateVec
     // is designed to handle non-linearities like this.
     const float sz = std::max(0.1f*face.size, face.size + offset[2]*img_scale);
 
-    return PoseEstimator::Face{
+    return {
         r,
-        {},
-        {},
         p,
-        {},
         sz,
-        {}
     };
 }
 
@@ -77,9 +80,9 @@ StateVec relative_to(const QuatPose& reference, const QuatPose& pose)
 }
 
 
-ukf_cv::SigmaPoints<6> relative_to(const QuatPose& pose, const std::array<QuatPose,num_sigmas>& sigmas)
+ukf_cv::SigmaPoints<dofs> relative_to(const QuatPose& pose, const std::array<QuatPose,num_sigmas>& sigmas)
 {
-    ukf_cv::SigmaPoints<6> out;
+    ukf_cv::SigmaPoints<dofs> out; // Beware, the number of points is != the number of DoFs.
     std::transform(sigmas.begin(), sigmas.end(), out.begin(), [&pose](const QuatPose& s) {
         return relative_to(pose, s);
     });
@@ -87,14 +90,14 @@ ukf_cv::SigmaPoints<6> relative_to(const QuatPose& pose, const std::array<QuatPo
 }
 
 
-std::array<QuatPose,num_sigmas> compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints<6>& sigmas, Face2WorldFunction face2world)
+std::array<QuatPose,num_sigmas> compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints<dofs>& sigmas, Face2WorldFunction face2world)
 {
     std::array<QuatPose,num_sigmas> out;
     std::transform(sigmas.begin(), sigmas.end(), out.begin(), [face2world=std::move(face2world), &face](const StateVec& sigma_point) {
         // First unpack the state vector and generate quaternion rotation w.r.t image space.
-        const auto sigma_face = apply_offset(face, sigma_point);
+        const auto [rotation, center, size] = apply_offset(face, sigma_point);
         // Then transform ...
-        QuatPose pose = face2world(sigma_face.rotation, sigma_face.center, sigma_face.size);
+        QuatPose pose = face2world(rotation, center, size);
         pose.pos /= world_scale;
         return pose;
     });
@@ -131,24 +134,32 @@ StateVec apply_filter_to_offset(const StateVec& offset, const StateCov& offset_c
 
 QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose_, float dt, Face2WorldFunction face2world, const FiltParams& params)
 {
-    ukf_cv::MerweScaledSigmaPoints<6> unscentedtrafo;
+    ukf_cv::MerweScaledSigmaPoints<dofs> unscentedtrafo;
     auto previous_pose = previous_pose_;
     previous_pose.pos /= world_scale;
 
-    // Here we have the covariance matrix for the offset from the observed values in `face`.
+    // Get 6 DoF covariance factor for the predictions in the face crop space.
     const auto cov_tril = make_tangent_space_uncertainty_tril(face);
 
-    // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose.
-    const ukf_cv::SigmaPoints<6> sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true);
+    // Compute so called sigma points. These represent the distribution from the covariance matrix in terms of 
+    // sampling points.
+    const ukf_cv::SigmaPoints<dofs> sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true);
 
+    // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose.
+    // The trick is to transform the sampling points and compute a covariance from them in the output space.
     // We have many of these sigma points. This is why that callback comes into play here.
+    // The transform to 3d world space is more than Face2WorldFunction because we also need to apply the sigma point (as
+    // a relative offset) to the pose in face crop space.
     const std::array<QuatPose,num_sigmas> pose_sigmas = compute_world_pose_from_sigma_point(face, sigmas, std::move(face2world));
 
-    const ukf_cv::SigmaPoints<6> deltas_sigmas = relative_to(previous_pose, pose_sigmas);
+    // Compute sigma points relative to the previous pose
+    const ukf_cv::SigmaPoints<dofs> deltas_sigmas = relative_to(previous_pose, pose_sigmas);
 
+    // Compute the mean offset from the last pose and the spread due to the networks uncertainty output.
     const auto [offset, offset_cov] = unscentedtrafo.compute_statistics(deltas_sigmas);
 
-    // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive at the final output.
+    // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive 
+    // at the final output.
     const StateVec scaled_offset = apply_filter_to_offset(offset, offset_cov, dt, params);
 
     QuatPose new_pose = apply_offset(previous_pose, scaled_offset);
diff --git a/tracker-neuralnet/model_adapters.cpp b/tracker-neuralnet/model_adapters.cpp
index a8580a89..f53478af 100644
--- a/tracker-neuralnet/model_adapters.cpp
+++ b/tracker-neuralnet/model_adapters.cpp
@@ -192,11 +192,15 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses
     if (session_.GetOutputCount() < 2)
         throw std::runtime_error("Invalid Model: must have at least two outputs");
 
-    // WARNING UB .. but still ...
-    // If the model was saved without meta data, it seems the version field is uninitialized.
-    // In that case reading from it is UB. However, in practice we will just get some arbitrary number
-    // which is hopefully different from the numbers used by models where the version is set.
-    if (model_version_ != 2 && model_version_ != 3)
+    // WARNING: Messy model compatibility issues!
+    // When reading the initial model release, it did not have the version field set.
+    // Reading it here will result in some unspecified value. It's probably UB due to
+    // reading uninitialized memory. But there is little choice.
+    // Now, detection of this old version is messy ... we have to guess based on the 
+    // number we get. Getting an uninitialized value matching a valid version is unlikely.
+    // But the real problem is that this line must be updated whenever we want to bump the
+    // version number!!
+    if (model_version_ <= 0 || model_version_ > 4)
         model_version_ = 1;
 
     const cv::Size input_image_shape = get_input_image_shape(session_);
@@ -224,10 +228,9 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses
         { "box", TensorSpec{ { 1, 4}, &output_box_[0], output_box_.rows } },
         { "rotaxis_scales_tril", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }},
         { "rotaxis_std", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, // TODO: Delete when old models aren't used any more
-        { "eyes", TensorSpec{ { 1, 2}, output_eyes_.val, output_eyes_.rows }},
-        { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}},
-        { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}},
-        //{ "box_std", TensorSpec{ {1, 4}, output_box_scales_.val, output_box_scales_ .rows}}
+        { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}},
+        { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}},
+        { "pos_size_scales_tril", TensorSpec{ {1, 3, 3}, output_coord_scales_tril_.val, 9}}
     };
 
     qDebug() << "Pose model inputs (" << session_.GetInputCount() << ")";
@@ -236,17 +239,17 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses
     output_c_names_.resize(session_.GetOutputCount());
     for (size_t i=0; i<session_.GetOutputCount(); ++i)
     {
-        std::string name = get_network_output_name(i);
+        const std::string name = get_network_output_name(i);
         const auto& output_info = session_.GetOutputTypeInfo(i);
         const auto& onnx_tensor_spec = output_info.GetTensorTypeAndShapeInfo();
-        auto my_tensor_spec = understood_outputs.find(name);
+        auto my_tensor_spec_it = understood_outputs.find(name);
 
         qDebug() << "\t" << name.c_str() << " (" << onnx_tensor_spec.GetShape() << ") dtype: " <<  onnx_tensor_spec.GetElementType() << " " <<
-            (my_tensor_spec != understood_outputs.end() ? "ok" : "unknown");
+            (my_tensor_spec_it != understood_outputs.end() ? "ok" : "unknown");
 
-        if (my_tensor_spec != understood_outputs.end())
+        if (my_tensor_spec_it != understood_outputs.end())
         {
-            TensorSpec& t = my_tensor_spec->second;
+            TensorSpec& t = my_tensor_spec_it->second;
             if (onnx_tensor_spec.GetShape() != t.shape || 
                 onnx_tensor_spec.GetElementType() != Ort::TypeToTensorType<float>::type)
                 throw std::runtime_error("Invalid output tensor spec for "s + name);
@@ -266,29 +269,9 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses
     has_uncertainty_ = understood_outputs.at("rotaxis_scales_tril").available ||
                        understood_outputs.at("rotaxis_std").available;
     has_uncertainty_ &= understood_outputs.at("pos_size_std").available ||
-                        understood_outputs.at("pos_size_scales").available;
-    //has_uncertainty_ &= understood_outputs.at("box_std").available;
-    has_eye_closed_detection_ = understood_outputs.at("eyes").available;
-
-    // FIXME: Recurrent states
-
-    // size_t num_regular_outputs = 2;
-
-    // num_recurrent_states_ = session_.GetInputCount()-1;
-    // if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_)
-    //     throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers.");
-
-    // // Create tensors for recurrent state
-    // for (size_t i = 0; i < num_recurrent_states_; ++i)
-    // {
-    //     const auto& input_info = session_.GetInputTypeInfo(1+i);
-    //     const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i);
-    //     if (input_info.GetTensorTypeAndShapeInfo().GetShape() != 
-    //         output_info.GetTensorTypeAndShapeInfo().GetShape())
-    //         throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output");
-    //     input_val_.push_back(create_tensor(input_info, allocator_));
-    //     output_val_.push_back(create_tensor(output_info, allocator_));
-    // }
+                        understood_outputs.at("pos_size_scales").available ||
+                        understood_outputs.at("pos_size_scales_tril").available;
+    pos_scale_uncertainty_is_matrix_ = understood_outputs.at("pos_size_scales_tril").available;
 
     input_names_.resize(session_.GetInputCount());
     input_c_names_.resize(session_.GetInputCount());
@@ -348,38 +331,32 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
         return {};
     }
 
-    for (size_t i = 0; i<num_recurrent_states_; ++i)
-    {
-        // Next step, the current output becomes the input.
-        // Thus we realize the recurrent connection.
-        // Only swaps the internal pointers. There is no copy of data.
-        std::swap(
-            output_val_[output_val_.size()-num_recurrent_states_+i],
-            input_val_[input_val_.size()-num_recurrent_states_+i]);
-    }
-
-    // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why?
-    //        The instructions are always the same. Maybe a memory allocation
-    //        issue. The ONNX api suggests that tensor are allocated in an
-    //        arena. Does that matter? Maybe the issue is something else?
-
     last_inference_time_ = t.elapsed_ms();
 
     // Perform coordinate transformation.
     // From patch-local normalized in [-1,1] to
-    // frame unnormalized pixel coordinatesettings.
+    // frame unnormalized pixel.
+
+    cv::Matx33f center_size_cov_tril = {};
+    if (has_uncertainty_)
+    {
+        if (pos_scale_uncertainty_is_matrix_)
+        {
+            center_size_cov_tril = output_coord_scales_tril_;
+        }
+        else
+        {
+            center_size_cov_tril(0,0) = output_coord_scales_std_[0];
+            center_size_cov_tril(1,1) = output_coord_scales_std_[1];
+            center_size_cov_tril(2,2) = output_coord_scales_std_[2];
+        }
+        center_size_cov_tril *= patch_size*0.5f;
+    }
 
     const cv::Point2f center = patch_center + 
         (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]};
-
-    cv::Point2f center_stddev = {
-        (0.5f*patch_size)*output_coord_scales_[0],
-        (0.5f*patch_size)*output_coord_scales_[1] };
-
     const float size = patch_size*0.5f*output_coord_[2];
 
-    float size_stddev = patch_size*0.5f*output_coord_scales_[2];
-
     // Following Eigen which uses quat components in the order w, x, y, z.
     // As does OpenCV
     cv::Quatf rotation = { 
@@ -392,6 +369,9 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
     assert(output_rotaxis_scales_tril_(0, 1) == 0);
     assert(output_rotaxis_scales_tril_(0, 2) == 0);
     assert(output_rotaxis_scales_tril_(1, 2) == 0);
+    assert(center_size_cov_tril(0, 1) == 0);
+    assert(center_size_cov_tril(0, 2) == 0);
+    assert(center_size_cov_tril(1, 2) == 0);
 
     cv::Matx33f rotaxis_scales_tril = output_rotaxis_scales_tril_;
     
@@ -407,29 +387,9 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
         0.5f*patch_size*(output_box_[2]-output_box_[0]),
         0.5f*patch_size*(output_box_[3]-output_box_[1])
     };
-    // const RoiCorners outbox = {
-    //     patch_center + 0.5f*patch_size*cv::Point2f{output_box_[0], output_box_[1]},
-    //     patch_center + 0.5f*patch_size*cv::Point2f{output_box_[2], output_box_[3]}
-    // };
-    // RoiCorners outbox_stddev = {
-    //     0.5f*patch_size*cv::Point2f{output_box_scales_[0], output_box_scales_[1]},
-    //     0.5f*patch_size*cv::Point2f{output_box_scales_[2], output_box_scales_[3]}
-    // };
-
-    // Because the model is sensitive to closing eyes we increase the uncertainty 
-    // a lot to make the subsequent filtering smooth the output more. This should suppress
-    // "twitching" when the user blinks.
-    if (has_eye_closed_detection_)
-    {
-        const float eye_open = std::min(output_eyes_[0], output_eyes_[1]);
-        const float increase_factor = 1.f + 10.f * std::pow(1. - eye_open,4.f);
-        rotaxis_scales_tril *= increase_factor;
-        size_stddev *= increase_factor;
-        center_stddev *= increase_factor;
-    }
 
     return std::optional<Face>({
-        rotation, rotaxis_scales_tril, outbox, center, center_stddev, size, size_stddev
+        rotation, rotaxis_scales_tril, outbox, center, size, center_size_cov_tril
     });
 }
 
diff --git a/tracker-neuralnet/model_adapters.h b/tracker-neuralnet/model_adapters.h
index 48f2fa2c..c1aaa6de 100644
--- a/tracker-neuralnet/model_adapters.h
+++ b/tracker-neuralnet/model_adapters.h
@@ -50,9 +50,8 @@ class PoseEstimator
             cv::Matx33f rotaxis_cov_tril; // Lower triangular factor of Cholesky decomposition
             cv::Rect2f box;
             cv::Point2f center;
-            cv::Point2f center_stddev;
             float size;
-            float size_stddev;
+            cv::Matx33f center_size_cov_tril; // Lower triangular factor of Cholesky decomposition
         };
 
         PoseEstimator(Ort::MemoryInfo &allocator_info,
@@ -84,16 +83,15 @@ class PoseEstimator
         cv::Vec<float, 4> output_quat_{};   //  Quaternion output
         cv::Vec<float, 4> output_box_{};    // Bounding box output
         cv::Matx33f output_rotaxis_scales_tril_{}; // Lower triangular matrix of LLT factorization of covariance of rotation vector as offset from output quaternion
-        cv::Vec<float, 2> output_eyes_{};
-        cv::Vec<float, 3> output_coord_scales_{};
+        cv::Matx33f output_coord_scales_tril_{}; // Lower triangular factor
+        cv::Vec3f output_coord_scales_std_{}; // Depending on the model, alternatively a 3d vector with standard deviations.
         std::vector<Ort::Value> output_val_; // Tensors to put the model outputs in.
         std::vector<std::string> output_names_; // Refers to the names in the onnx model.
         std::vector<const char *> output_c_names_; // Refers to the C names in the onnx model.
         // More bookkeeping
-        size_t num_recurrent_states_ = 0;
         double last_inference_time_ = 0;
         bool has_uncertainty_ = false;
-        bool has_eye_closed_detection_ = false;
+        bool pos_scale_uncertainty_is_matrix_ = false;
 };
 
 
-- 
cgit v1.2.3