From 25af3db8ba560d86f78aa4bfd10039588b7bab82 Mon Sep 17 00:00:00 2001
From: Michael Welter <michael@welter-4d.de>
Date: Sat, 26 Feb 2022 22:20:40 +0100
Subject: tracker/nn: Support new coordinate conventions for head rotation

---
 tracker-neuralnet/ftnoir_tracker_neuralnet.cpp | 94 ++++++++++++++++----------
 tracker-neuralnet/ftnoir_tracker_neuralnet.h   |  3 +-
 2 files changed, 59 insertions(+), 38 deletions(-)

(limited to 'tracker-neuralnet')
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
index 320c8f23..06874e6c 100644
--- a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
+++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
@@ -34,12 +34,14 @@
 // https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp
 // https://github.com/leimao/ONNX-Runtime-Inference/blob/main/src/inference.cpp
 
-namespace
+namespace neuralnet_tracker_ns
 {
 
+
 using numeric_types::vec3;
 using numeric_types::vec2;
 using numeric_types::mat33;
+using quat = std::array<numeric_types::f,4>;
 
 // Minimal difference if at all going from 1 to 2 threads.
 static constexpr int num_threads = 1;
@@ -136,6 +138,42 @@ mat33 quaternion_to_mat33(const std::array<float,4> quat)
 }
 
 
+vec3 image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
+{
+    // Compute the location the network outputs in 3d space.
+    const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / size * reference_size_in_mm;
+    const float zpos = (x / image_size.width * 2.f - 1.f) * xpos / intrinsics.focal_length_w;
+    const float ypos = (y / image_size.height * 2.f - 1.f) * xpos / intrinsics.focal_length_h;
+    return {xpos, ypos, zpos};
+}
+
+
+vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
+{
+    const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w;
+    const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h;
+    const float x = (xscr+1.)*0.5f*image_size.width;
+    const float y = (yscr+1.)*0.5f*image_size.height;
+    return {x, y};
+}
+
+
+quat image_to_world(quat q)
+{
+    std::swap(q[1], q[3]);
+    q[1] = -q[1];
+    q[2] = -q[2];
+    q[3] = -q[3];
+    return q;
+}
+
+quat world_to_image(quat q)
+{
+    // It's its own inverse.
+    return image_to_world(q);
+}
+
+
 template<class T>
 T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b)
 {
@@ -158,13 +196,6 @@ cv::Size get_input_image_shape(const Ort::Session &session)
 }
 
 
-} // namespace
-
-
-namespace neuralnet_tracker_ns
-{
-
-
 int enum_to_fps(int value)
 {
     switch (value)
@@ -235,8 +266,9 @@ double Localizer::last_inference_time_millis() const
 }
 
 
-PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session) :
-    session{std::move(_session)}
+PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session) 
+    : session{std::move(_session)}
+    , model_version(session.GetModelMetadata().GetVersion())
 {
     const cv::Size input_image_shape = get_input_image_shape(session);
 
@@ -296,7 +328,7 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
     const cv::Mat &frame, const cv::Rect &box)
 {
     cv::Mat cropped;
-    
+
     const int patch_size = std::max(box.width, box.height)*1.05;
     const cv::Point2f patch_center = {
         std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols),
@@ -308,7 +340,7 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
     // Have to catch this case.
     if (cropped.rows != patch_size || cropped.cols != patch_size)
         return {};
-    
+
     auto p = input_mat.ptr(0);
 
     cv::resize(cropped, scaled_frame, scaled_frame.size(), 0, 0, cv::INTER_AREA);
@@ -359,12 +391,18 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
     const float size = patch_size*0.5f*output_coord[2];
 
     // Following Eigen which uses quat components in the order w, x, y, z.
-    const std::array<float,4> rotation = { 
+    quat rotation = { 
         output_quat[3], 
         output_quat[0], 
         output_quat[1], 
         output_quat[2] };
 
+    if (model_version < 2)
+    {
+        // Due to a change in coordinate conventions
+        rotation = world_to_image(rotation);
+    }
+
     const cv::Rect2f outbox = {
         patch_center.x + (0.5f*patch_size)*output_box[0],
         patch_center.y + (0.5f*patch_size)*output_box[1],
@@ -453,7 +491,8 @@ Affine neuralnet_tracker::compute_pose(const PoseEstimator::Face &face) const
         normalize(face.center, frame.rows, frame.cols),
         intrinsics.focal_length_w);
 
-    const mat33 m = rot_correction * quaternion_to_mat33(face.rotation);
+    const mat33 m = rot_correction * quaternion_to_mat33(
+        image_to_world(face.rotation));
 
     /*
          
@@ -469,7 +508,10 @@ Affine neuralnet_tracker::compute_pose(const PoseEstimator::Face &face) const
     */
 
     // Compute the location the network outputs in 3d space.
-    const vec3 face_world_pos = image_to_world(face.center.x, face.center.y, face.size, head_size_mm);
+    const vec3 face_world_pos = image_to_world(
+        face.center.x, face.center.y, face.size, head_size_mm,
+        frame.size(),
+        intrinsics);
 
     // But this is in general not the location of the rotation joint in the neck.
     // So we need an extra offset. Which we determine by solving
@@ -520,7 +562,7 @@ void neuralnet_tracker::draw_gizmos(
         draw_coord_line(2, {255, 0, 0});
 
         // Draw the computed joint position
-        auto xy = world_to_image(pose.t);
+        auto xy = world_to_image(pose.t, frame.size(), intrinsics);
         cv::circle(frame, cv::Point(xy[0],xy[1]), 5, cv::Scalar(0,0,255), -1);
     }
 
@@ -658,26 +700,6 @@ void neuralnet_tracker::set_intrinsics()
 }
 
 
-vec3 neuralnet_tracker::image_to_world(float x, float y, float size, float real_size) const
-{
-    // Compute the location the network outputs in 3d space.
-    const float xpos = -(intrinsics.focal_length_w * frame.cols * 0.5f) / size * real_size;
-    const float zpos = (x / frame.cols * 2.f - 1.f) * xpos / intrinsics.focal_length_w;
-    const float ypos = (y / frame.rows * 2.f - 1.f) * xpos / intrinsics.focal_length_h;
-    return {xpos, ypos, zpos};
-}
-
-
-vec2 neuralnet_tracker::world_to_image(const vec3& pos) const
-{
-    const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w;
-    const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h;
-    const float x = (xscr+1.)*0.5f*frame.cols;
-    const float y = (yscr+1.)*0.5f*frame.rows;
-    return {x, y};
-}
-
-
 void neuralnet_tracker::run()
 {
     if (!open_camera())
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.h b/tracker-neuralnet/ftnoir_tracker_neuralnet.h
index c74a7b1d..b6405d1a 100644
--- a/tracker-neuralnet/ftnoir_tracker_neuralnet.h
+++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.h
@@ -131,6 +131,7 @@ class PoseEstimator
             Ort::Value{nullptr}, 
             Ort::Value{nullptr}};
         double last_inference_time = 0;
+        int64_t model_version = 0;
 };
 
 
@@ -160,8 +161,6 @@ private:
     void update_fps(double dt);
 
     Affine compute_pose(const PoseEstimator::Face &face) const;
-    numeric_types::vec3 image_to_world(float x, float y, float size, float real_size) const;
-    numeric_types::vec2 world_to_image(const numeric_types::vec3& p) const;
 
     settings s;
     std::optional<Localizer> localizer;
-- 
cgit v1.2.3