3 files changed, 79 insertions, 110 deletions
diff --git a/tracker-neuralnet/deadzone_filter.cpp b/tracker-neuralnet/deadzone_filter.cpp
index b41afdba..fa96eeb3 100644
--- a/tracker-neuralnet/deadzone_filter.cpp
+++ b/tracker-neuralnet/deadzone_filter.cpp
@@ -3,6 +3,7 @@
 #include "opencv_contrib.h"
 #include "unscented_trafo.h"
 
+#include <tuple>
 #include <opencv2/core/base.hpp>
 #include <opencv2/core/matx.hpp>
 #include <opencv2/core/quaternion.hpp>
@@ -12,19 +13,25 @@ namespace neuralnet_tracker_ns
 
 using namespace cvcontrib;
 
-using StateVec = cv::Vec<float,6>;
-using StateCov = cv::Matx<float,6,6>;
+// Number of degrees of freedom of position and rotation
+static constexpr int dofs = 6;
 
-static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints<6>::num_sigmas;
+using StateVec = cv::Vec<float,dofs>;
+using StateCov = cv::Matx<float,dofs,dofs>;
+
+static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints<dofs>::num_sigmas;
+// Rescaling factor for position/size living in the space of the face crop.
+// Applied prior to application of UKF to prevent numerical problems.
 static constexpr float img_scale = 200.f;
+// Similar rescaling factor for position/size that live in world space.
 static constexpr float world_scale = 1000.f; // mm
 
+// Fills the 6 DoF covariance factor, as in L L^T factorization.
+// Covariance is given wrt the tangent space of current predictions
 StateCov make_tangent_space_uncertainty_tril(const PoseEstimator::Face &face)
 {
     StateCov tril = StateCov::eye();
-    tril(0,0) = face.center_stddev.x/img_scale;
-    tril(1,1) = face.center_stddev.y/img_scale;
-    tril(2,2) = face.size_stddev/img_scale;
+    set_minor<3,3>(tril, 0, 0, face.center_size_cov_tril / img_scale);
     set_minor<3,3>(tril, 3, 3, face.rotaxis_cov_tril);
     return tril;
 }
@@ -41,7 +48,7 @@ QuatPose apply_offset(const QuatPose& pose, const StateVec& offset)
 }
 
 
-PoseEstimator::Face apply_offset(const PoseEstimator::Face& face, const StateVec& offset)
+std::tuple<cv::Quatf, cv::Point2f, float> apply_offset(const PoseEstimator::Face& face, const StateVec& offset)
 {
     const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] });
     const auto r = face.rotation * dr;
@@ -57,14 +64,10 @@ PoseEstimator::Face apply_offset(const PoseEstimator::Face& face, const StateVec
     // is designed to handle non-linearities like this.
     const float sz = std::max(0.1f*face.size, face.size + offset[2]*img_scale);
 
-    return PoseEstimator::Face{
+    return {
         r,
-        {},
-        {},
         p,
-        {},
         sz,
-        {}
     };
 }
 
@@ -77,9 +80,9 @@ StateVec relative_to(const QuatPose& reference, const QuatPose& pose)
 }
 
 
-ukf_cv::SigmaPoints<6> relative_to(const QuatPose& pose, const std::array<QuatPose,num_sigmas>& sigmas)
+ukf_cv::SigmaPoints<dofs> relative_to(const QuatPose& pose, const std::array<QuatPose,num_sigmas>& sigmas)
 {
-    ukf_cv::SigmaPoints<6> out;
+    ukf_cv::SigmaPoints<dofs> out; // Beware, the number of points is != the number of DoFs.
     std::transform(sigmas.begin(), sigmas.end(), out.begin(), [&pose](const QuatPose& s) {
         return relative_to(pose, s);
     });
@@ -87,14 +90,14 @@ ukf_cv::SigmaPoints<6> relative_to(const QuatPose& pose, const std::array<QuatPo
 }
 
 
-std::array<QuatPose,num_sigmas> compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints<6>& sigmas, Face2WorldFunction face2world)
+std::array<QuatPose,num_sigmas> compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints<dofs>& sigmas, Face2WorldFunction face2world)
 {
     std::array<QuatPose,num_sigmas> out;
     std::transform(sigmas.begin(), sigmas.end(), out.begin(), [face2world=std::move(face2world), &face](const StateVec& sigma_point) {
         // First unpack the state vector and generate quaternion rotation w.r.t image space.
-        const auto sigma_face = apply_offset(face, sigma_point);
+        const auto [rotation, center, size] = apply_offset(face, sigma_point);
         // Then transform ...
-        QuatPose pose = face2world(sigma_face.rotation, sigma_face.center, sigma_face.size);
+        QuatPose pose = face2world(rotation, center, size);
         pose.pos /= world_scale;
         return pose;
     });
@@ -131,24 +134,32 @@ StateVec apply_filter_to_offset(const StateVec& offset, const StateCov& offset_c
 
 QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose_, float dt, Face2WorldFunction face2world, const FiltParams& params)
 {
-    ukf_cv::MerweScaledSigmaPoints<6> unscentedtrafo;
+    ukf_cv::MerweScaledSigmaPoints<dofs> unscentedtrafo;
     auto previous_pose = previous_pose_;
     previous_pose.pos /= world_scale;
 
-    // Here we have the covariance matrix for the offset from the observed values in `face`.
+    // Get 6 DoF covariance factor for the predictions in the face crop space.
     const auto cov_tril = make_tangent_space_uncertainty_tril(face);
 
-    // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose.
-    const ukf_cv::SigmaPoints<6> sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true);
+    // Compute so called sigma points. These represent the distribution from the covariance matrix in terms of 
+    // sampling points.
+    const ukf_cv::SigmaPoints<dofs> sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true);
 
+    // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose.
+    // The trick is to transform the sampling points and compute a covariance from them in the output space.
     // We have many of these sigma points. This is why that callback comes into play here.
+    // The transform to 3d world space is more than Face2WorldFunction because we also need to apply the sigma point (as
+    // a relative offset) to the pose in face crop space.
     const std::array<QuatPose,num_sigmas> pose_sigmas = compute_world_pose_from_sigma_point(face, sigmas, std::move(face2world));
 
-    const ukf_cv::SigmaPoints<6> deltas_sigmas = relative_to(previous_pose, pose_sigmas);
+    // Compute sigma points relative to the previous pose
+    const ukf_cv::SigmaPoints<dofs> deltas_sigmas = relative_to(previous_pose, pose_sigmas);
 
+    // Compute the mean offset from the last pose and the spread due to the networks uncertainty output.
     const auto [offset, offset_cov] = unscentedtrafo.compute_statistics(deltas_sigmas);
 
-    // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive at the final output.
+    // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive 
+    // at the final output.
     const StateVec scaled_offset = apply_filter_to_offset(offset, offset_cov, dt, params);
 
     QuatPose new_pose = apply_offset(previous_pose, scaled_offset);
diff --git a/tracker-neuralnet/model_adapters.cpp b/tracker-neuralnet/model_adapters.cpp
index a8580a89..f53478af 100644
--- a/tracker-neuralnet/model_adapters.cpp
+++ b/tracker-neuralnet/model_adapters.cpp
@@ -192,11 +192,15 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses
     if (session_.GetOutputCount() < 2)
         throw std::runtime_error("Invalid Model: must have at least two outputs");
 
-    // WARNING UB .. but still ...
-    // If the model was saved without meta data, it seems the version field is uninitialized.
-    // In that case reading from it is UB. However, in practice we will just get some arbitrary number
-    // which is hopefully different from the numbers used by models where the version is set.
-    if (model_version_ != 2 && model_version_ != 3)
+    // WARNING: Messy model compatibility issues!
+    // When reading the initial model release, it did not have the version field set.
+    // Reading it here will result in some unspecified value. It's probably UB due to
+    // reading uninitialized memory. But there is little choice.
+    // Now, detection of this old version is messy ... we have to guess based on the 
+    // number we get. Getting an uninitialized value matching a valid version is unlikely.
+    // But the real problem is that this line must be updated whenever we want to bump the
+    // version number!!
+    if (model_version_ <= 0 || model_version_ > 4)
         model_version_ = 1;
 
     const cv::Size input_image_shape = get_input_image_shape(session_);
@@ -224,10 +228,9 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses
         { "box", TensorSpec{ { 1, 4}, &output_box_[0], output_box_.rows } },
         { "rotaxis_scales_tril", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }},
         { "rotaxis_std", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, // TODO: Delete when old models aren't used any more
-        { "eyes", TensorSpec{ { 1, 2}, output_eyes_.val, output_eyes_.rows }},
-        { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}},
-        { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}},
-        //{ "box_std", TensorSpec{ {1, 4}, output_box_scales_.val, output_box_scales_ .rows}}
+        { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}},
+        { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}},
+        { "pos_size_scales_tril", TensorSpec{ {1, 3, 3}, output_coord_scales_tril_.val, 9}}
     };
 
     qDebug() << "Pose model inputs (" << session_.GetInputCount() << ")";
@@ -236,17 +239,17 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses
     output_c_names_.resize(session_.GetOutputCount());
     for (size_t i=0; i<session_.GetOutputCount(); ++i)
     {
-        std::string name = get_network_output_name(i);
+        const std::string name = get_network_output_name(i);
         const auto& output_info = session_.GetOutputTypeInfo(i);
         const auto& onnx_tensor_spec = output_info.GetTensorTypeAndShapeInfo();
-        auto my_tensor_spec = understood_outputs.find(name);
+        auto my_tensor_spec_it = understood_outputs.find(name);
 
         qDebug() << "\t" << name.c_str() << " (" << onnx_tensor_spec.GetShape() << ") dtype: " <<  onnx_tensor_spec.GetElementType() << " " <<
-            (my_tensor_spec != understood_outputs.end() ? "ok" : "unknown");
+            (my_tensor_spec_it != understood_outputs.end() ? "ok" : "unknown");
 
-        if (my_tensor_spec != understood_outputs.end())
+        if (my_tensor_spec_it != understood_outputs.end())
         {
-            TensorSpec& t = my_tensor_spec->second;
+            TensorSpec& t = my_tensor_spec_it->second;
             if (onnx_tensor_spec.GetShape() != t.shape || 
                 onnx_tensor_spec.GetElementType() != Ort::TypeToTensorType<float>::type)
                 throw std::runtime_error("Invalid output tensor spec for "s + name);
@@ -266,29 +269,9 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses
     has_uncertainty_ = understood_outputs.at("rotaxis_scales_tril").available ||
                        understood_outputs.at("rotaxis_std").available;
     has_uncertainty_ &= understood_outputs.at("pos_size_std").available ||
-                        understood_outputs.at("pos_size_scales").available;
-    //has_uncertainty_ &= understood_outputs.at("box_std").available;
-    has_eye_closed_detection_ = understood_outputs.at("eyes").available;
-
-    // FIXME: Recurrent states
-
-    // size_t num_regular_outputs = 2;
-
-    // num_recurrent_states_ = session_.GetInputCount()-1;
-    // if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_)
-    //     throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers.");
-
-    // // Create tensors for recurrent state
-    // for (size_t i = 0; i < num_recurrent_states_; ++i)
-    // {
-    //     const auto& input_info = session_.GetInputTypeInfo(1+i);
-    //     const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i);
-    //     if (input_info.GetTensorTypeAndShapeInfo().GetShape() != 
-    //         output_info.GetTensorTypeAndShapeInfo().GetShape())
-    //         throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output");
-    //     input_val_.push_back(create_tensor(input_info, allocator_));
-    //     output_val_.push_back(create_tensor(output_info, allocator_));
-    // }
+                        understood_outputs.at("pos_size_scales").available ||
+                        understood_outputs.at("pos_size_scales_tril").available;
+    pos_scale_uncertainty_is_matrix_ = understood_outputs.at("pos_size_scales_tril").available;
 
     input_names_.resize(session_.GetInputCount());
     input_c_names_.resize(session_.GetInputCount());
@@ -348,38 +331,32 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
         return {};
     }
 
-    for (size_t i = 0; i<num_recurrent_states_; ++i)
-    {
-        // Next step, the current output becomes the input.
-        // Thus we realize the recurrent connection.
-        // Only swaps the internal pointers. There is no copy of data.
-        std::swap(
-            output_val_[output_val_.size()-num_recurrent_states_+i],
-            input_val_[input_val_.size()-num_recurrent_states_+i]);
-    }
-
-    // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why?
-    //        The instructions are always the same. Maybe a memory allocation
-    //        issue. The ONNX api suggests that tensor are allocated in an
-    //        arena. Does that matter? Maybe the issue is something else?
-
     last_inference_time_ = t.elapsed_ms();
 
     // Perform coordinate transformation.
     // From patch-local normalized in [-1,1] to
-    // frame unnormalized pixel coordinatesettings.
+    // frame unnormalized pixel.
+
+    cv::Matx33f center_size_cov_tril = {};
+    if (has_uncertainty_)
+    {
+        if (pos_scale_uncertainty_is_matrix_)
+        {
+            center_size_cov_tril = output_coord_scales_tril_;
+        }
+        else
+        {
+            center_size_cov_tril(0,0) = output_coord_scales_std_[0];
+            center_size_cov_tril(1,1) = output_coord_scales_std_[1];
+            center_size_cov_tril(2,2) = output_coord_scales_std_[2];
+        }
+        center_size_cov_tril *= patch_size*0.5f;
+    }
 
     const cv::Point2f center = patch_center + 
         (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]};
-
-    cv::Point2f center_stddev = {
-        (0.5f*patch_size)*output_coord_scales_[0],
-        (0.5f*patch_size)*output_coord_scales_[1] };
-
     const float size = patch_size*0.5f*output_coord_[2];
 
-    float size_stddev = patch_size*0.5f*output_coord_scales_[2];
-
     // Following Eigen which uses quat components in the order w, x, y, z.
     // As does OpenCV
     cv::Quatf rotation = { 
@@ -392,6 +369,9 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
     assert(output_rotaxis_scales_tril_(0, 1) == 0);
     assert(output_rotaxis_scales_tril_(0, 2) == 0);
     assert(output_rotaxis_scales_tril_(1, 2) == 0);
+    assert(center_size_cov_tril(0, 1) == 0);
+    assert(center_size_cov_tril(0, 2) == 0);
+    assert(center_size_cov_tril(1, 2) == 0);
 
     cv::Matx33f rotaxis_scales_tril = output_rotaxis_scales_tril_;
     
@@ -407,29 +387,9 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
         0.5f*patch_size*(output_box_[2]-output_box_[0]),
         0.5f*patch_size*(output_box_[3]-output_box_[1])
     };
-    // const RoiCorners outbox = {
-    //     patch_center + 0.5f*patch_size*cv::Point2f{output_box_[0], output_box_[1]},
-    //     patch_center + 0.5f*patch_size*cv::Point2f{output_box_[2], output_box_[3]}
-    // };
-    // RoiCorners outbox_stddev = {
-    //     0.5f*patch_size*cv::Point2f{output_box_scales_[0], output_box_scales_[1]},
-    //     0.5f*patch_size*cv::Point2f{output_box_scales_[2], output_box_scales_[3]}
-    // };
-
-    // Because the model is sensitive to closing eyes we increase the uncertainty 
-    // a lot to make the subsequent filtering smooth the output more. This should suppress
-    // "twitching" when the user blinks.
-    if (has_eye_closed_detection_)
-    {
-        const float eye_open = std::min(output_eyes_[0], output_eyes_[1]);
-        const float increase_factor = 1.f + 10.f * std::pow(1. - eye_open,4.f);
-        rotaxis_scales_tril *= increase_factor;
-        size_stddev *= increase_factor;
-        center_stddev *= increase_factor;
-    }
 
     return std::optional<Face>({
-        rotation, rotaxis_scales_tril, outbox, center, center_stddev, size, size_stddev
+        rotation, rotaxis_scales_tril, outbox, center, size, center_size_cov_tril
     });
 }
 
diff --git a/tracker-neuralnet/model_adapters.h b/tracker-neuralnet/model_adapters.h
index 48f2fa2c..c1aaa6de 100644
--- a/tracker-neuralnet/model_adapters.h
+++ b/tracker-neuralnet/model_adapters.h
@@ -50,9 +50,8 @@ class PoseEstimator
             cv::Matx33f rotaxis_cov_tril; // Lower triangular factor of Cholesky decomposition
             cv::Rect2f box;
             cv::Point2f center;
-            cv::Point2f center_stddev;
             float size;
-            float size_stddev;
+            cv::Matx33f center_size_cov_tril; // Lower triangular factor of Cholesky decomposition
         };
 
         PoseEstimator(Ort::MemoryInfo &allocator_info,
@@ -84,16 +83,15 @@ class PoseEstimator
         cv::Vec<float, 4> output_quat_{};   //  Quaternion output
         cv::Vec<float, 4> output_box_{};    // Bounding box output
         cv::Matx33f output_rotaxis_scales_tril_{}; // Lower triangular matrix of LLT factorization of covariance of rotation vector as offset from output quaternion
-        cv::Vec<float, 2> output_eyes_{};
-        cv::Vec<float, 3> output_coord_scales_{};
+        cv::Matx33f output_coord_scales_tril_{}; // Lower triangular factor
+        cv::Vec3f output_coord_scales_std_{}; // Depending on the model, alternatively a 3d vector with standard deviations.
         std::vector<Ort::Value> output_val_; // Tensors to put the model outputs in.
         std::vector<std::string> output_names_; // Refers to the names in the onnx model.
         std::vector<const char *> output_c_names_; // Refers to the C names in the onnx model.
         // More bookkeeping
-        size_t num_recurrent_states_ = 0;
         double last_inference_time_ = 0;
         bool has_uncertainty_ = false;
-        bool has_eye_closed_detection_ = false;
+        bool pos_scale_uncertainty_is_matrix_ = false;
 };