diff options
author | Michael Welter <michael@welter-4d.de> | 2022-02-26 22:20:40 +0100 |
---|---|---|
committer | Michael Welter <michael@welter-4d.de> | 2022-05-15 16:53:44 +0200 |
commit | 25af3db8ba560d86f78aa4bfd10039588b7bab82 (patch) | |
tree | d81085a298a66ef39dd9c2ba811d587119d5a501 | |
parent | 4697a0e44536ae9dc169aab54df26e5dbebdec7c (diff) |
tracker/nn: Support new coordinate conventions for head rotation
-rw-r--r-- | tracker-neuralnet/ftnoir_tracker_neuralnet.cpp | 94 | ||||
-rw-r--r-- | tracker-neuralnet/ftnoir_tracker_neuralnet.h | 3 |
2 files changed, 59 insertions, 38 deletions
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp index 320c8f23..06874e6c 100644 --- a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp +++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp @@ -34,12 +34,14 @@ // https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp // https://github.com/leimao/ONNX-Runtime-Inference/blob/main/src/inference.cpp -namespace +namespace neuralnet_tracker_ns { + using numeric_types::vec3; using numeric_types::vec2; using numeric_types::mat33; +using quat = std::array<numeric_types::f,4>; // Minimal difference if at all going from 1 to 2 threads. static constexpr int num_threads = 1; @@ -136,6 +138,42 @@ mat33 quaternion_to_mat33(const std::array<float,4> quat) } +vec3 image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) +{ + // Compute the location the network outputs in 3d space. + const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / size * reference_size_in_mm; + const float zpos = (x / image_size.width * 2.f - 1.f) * xpos / intrinsics.focal_length_w; + const float ypos = (y / image_size.height * 2.f - 1.f) * xpos / intrinsics.focal_length_h; + return {xpos, ypos, zpos}; +} + + +vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) +{ + const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w; + const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h; + const float x = (xscr+1.)*0.5f*image_size.width; + const float y = (yscr+1.)*0.5f*image_size.height; + return {x, y}; +} + + +quat image_to_world(quat q) +{ + std::swap(q[1], q[3]); + q[1] = -q[1]; + q[2] = -q[2]; + q[3] = -q[3]; + return q; +} + +quat world_to_image(quat q) +{ + // It's its own inverse. + return image_to_world(q); +} + + template<class T> T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b) { @@ -158,13 +196,6 @@ cv::Size get_input_image_shape(const Ort::Session &session) } -} // namespace - - -namespace neuralnet_tracker_ns -{ - - int enum_to_fps(int value) { switch (value) @@ -235,8 +266,9 @@ double Localizer::last_inference_time_millis() const } -PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session) : - session{std::move(_session)} +PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session) + : session{std::move(_session)} + , model_version(session.GetModelMetadata().GetVersion()) { const cv::Size input_image_shape = get_input_image_shape(session); @@ -296,7 +328,7 @@ std::optional<PoseEstimator::Face> PoseEstimator::run( const cv::Mat &frame, const cv::Rect &box) { cv::Mat cropped; - + const int patch_size = std::max(box.width, box.height)*1.05; const cv::Point2f patch_center = { std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols), @@ -308,7 +340,7 @@ std::optional<PoseEstimator::Face> PoseEstimator::run( // Have to catch this case. if (cropped.rows != patch_size || cropped.cols != patch_size) return {}; - + auto p = input_mat.ptr(0); cv::resize(cropped, scaled_frame, scaled_frame.size(), 0, 0, cv::INTER_AREA); @@ -359,12 +391,18 @@ std::optional<PoseEstimator::Face> PoseEstimator::run( const float size = patch_size*0.5f*output_coord[2]; // Following Eigen which uses quat components in the order w, x, y, z. - const std::array<float,4> rotation = { + quat rotation = { output_quat[3], output_quat[0], output_quat[1], output_quat[2] }; + if (model_version < 2) + { + // Due to a change in coordinate conventions + rotation = world_to_image(rotation); + } + const cv::Rect2f outbox = { patch_center.x + (0.5f*patch_size)*output_box[0], patch_center.y + (0.5f*patch_size)*output_box[1], @@ -453,7 +491,8 @@ Affine neuralnet_tracker::compute_pose(const PoseEstimator::Face &face) const normalize(face.center, frame.rows, frame.cols), intrinsics.focal_length_w); - const mat33 m = rot_correction * quaternion_to_mat33(face.rotation); + const mat33 m = rot_correction * quaternion_to_mat33( + image_to_world(face.rotation)); /* @@ -469,7 +508,10 @@ Affine neuralnet_tracker::compute_pose(const PoseEstimator::Face &face) const */ // Compute the location the network outputs in 3d space. - const vec3 face_world_pos = image_to_world(face.center.x, face.center.y, face.size, head_size_mm); + const vec3 face_world_pos = image_to_world( + face.center.x, face.center.y, face.size, head_size_mm, + frame.size(), + intrinsics); // But this is in general not the location of the rotation joint in the neck. // So we need an extra offset. Which we determine by solving @@ -520,7 +562,7 @@ void neuralnet_tracker::draw_gizmos( draw_coord_line(2, {255, 0, 0}); // Draw the computed joint position - auto xy = world_to_image(pose.t); + auto xy = world_to_image(pose.t, frame.size(), intrinsics); cv::circle(frame, cv::Point(xy[0],xy[1]), 5, cv::Scalar(0,0,255), -1); } @@ -658,26 +700,6 @@ void neuralnet_tracker::set_intrinsics() } -vec3 neuralnet_tracker::image_to_world(float x, float y, float size, float real_size) const -{ - // Compute the location the network outputs in 3d space. - const float xpos = -(intrinsics.focal_length_w * frame.cols * 0.5f) / size * real_size; - const float zpos = (x / frame.cols * 2.f - 1.f) * xpos / intrinsics.focal_length_w; - const float ypos = (y / frame.rows * 2.f - 1.f) * xpos / intrinsics.focal_length_h; - return {xpos, ypos, zpos}; -} - - -vec2 neuralnet_tracker::world_to_image(const vec3& pos) const -{ - const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w; - const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h; - const float x = (xscr+1.)*0.5f*frame.cols; - const float y = (yscr+1.)*0.5f*frame.rows; - return {x, y}; -} - - void neuralnet_tracker::run() { if (!open_camera()) diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.h b/tracker-neuralnet/ftnoir_tracker_neuralnet.h index c74a7b1d..b6405d1a 100644 --- a/tracker-neuralnet/ftnoir_tracker_neuralnet.h +++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.h @@ -131,6 +131,7 @@ class PoseEstimator Ort::Value{nullptr}, Ort::Value{nullptr}}; double last_inference_time = 0; + int64_t model_version = 0; }; @@ -160,8 +161,6 @@ private: void update_fps(double dt); Affine compute_pose(const PoseEstimator::Face &face) const; - numeric_types::vec3 image_to_world(float x, float y, float size, float real_size) const; - numeric_types::vec2 world_to_image(const numeric_types::vec3& p) const; settings s; std::optional<Localizer> localizer; |