summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorMichael Welter <michael@welter-4d.de>2022-02-26 22:20:40 +0100
committerMichael Welter <michael@welter-4d.de>2022-05-15 16:53:44 +0200
commit25af3db8ba560d86f78aa4bfd10039588b7bab82 (patch)
treed81085a298a66ef39dd9c2ba811d587119d5a501
parent4697a0e44536ae9dc169aab54df26e5dbebdec7c (diff)
tracker/nn: Support new coordinate conventions for head rotation
-rw-r--r--tracker-neuralnet/ftnoir_tracker_neuralnet.cpp94
-rw-r--r--tracker-neuralnet/ftnoir_tracker_neuralnet.h3
2 files changed, 59 insertions, 38 deletions
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
index 320c8f23..06874e6c 100644
--- a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
+++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
@@ -34,12 +34,14 @@
// https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp
// https://github.com/leimao/ONNX-Runtime-Inference/blob/main/src/inference.cpp
-namespace
+namespace neuralnet_tracker_ns
{
+
using numeric_types::vec3;
using numeric_types::vec2;
using numeric_types::mat33;
+using quat = std::array<numeric_types::f,4>;
// Minimal difference if at all going from 1 to 2 threads.
static constexpr int num_threads = 1;
@@ -136,6 +138,42 @@ mat33 quaternion_to_mat33(const std::array<float,4> quat)
}
+vec3 image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
+{
+ // Compute the location the network outputs in 3d space.
+ const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / size * reference_size_in_mm;
+ const float zpos = (x / image_size.width * 2.f - 1.f) * xpos / intrinsics.focal_length_w;
+ const float ypos = (y / image_size.height * 2.f - 1.f) * xpos / intrinsics.focal_length_h;
+ return {xpos, ypos, zpos};
+}
+
+
+vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
+{
+ const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w;
+ const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h;
+ const float x = (xscr+1.)*0.5f*image_size.width;
+ const float y = (yscr+1.)*0.5f*image_size.height;
+ return {x, y};
+}
+
+
+quat image_to_world(quat q)
+{
+ std::swap(q[1], q[3]);
+ q[1] = -q[1];
+ q[2] = -q[2];
+ q[3] = -q[3];
+ return q;
+}
+
+quat world_to_image(quat q)
+{
+ // It's its own inverse.
+ return image_to_world(q);
+}
+
+
template<class T>
T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b)
{
@@ -158,13 +196,6 @@ cv::Size get_input_image_shape(const Ort::Session &session)
}
-} // namespace
-
-
-namespace neuralnet_tracker_ns
-{
-
-
int enum_to_fps(int value)
{
switch (value)
@@ -235,8 +266,9 @@ double Localizer::last_inference_time_millis() const
}
-PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session) :
- session{std::move(_session)}
+PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session)
+ : session{std::move(_session)}
+ , model_version(session.GetModelMetadata().GetVersion())
{
const cv::Size input_image_shape = get_input_image_shape(session);
@@ -296,7 +328,7 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
const cv::Mat &frame, const cv::Rect &box)
{
cv::Mat cropped;
-
+
const int patch_size = std::max(box.width, box.height)*1.05;
const cv::Point2f patch_center = {
std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols),
@@ -308,7 +340,7 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
// Have to catch this case.
if (cropped.rows != patch_size || cropped.cols != patch_size)
return {};
-
+
auto p = input_mat.ptr(0);
cv::resize(cropped, scaled_frame, scaled_frame.size(), 0, 0, cv::INTER_AREA);
@@ -359,12 +391,18 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
const float size = patch_size*0.5f*output_coord[2];
// Following Eigen which uses quat components in the order w, x, y, z.
- const std::array<float,4> rotation = {
+ quat rotation = {
output_quat[3],
output_quat[0],
output_quat[1],
output_quat[2] };
+ if (model_version < 2)
+ {
+ // Due to a change in coordinate conventions
+ rotation = world_to_image(rotation);
+ }
+
const cv::Rect2f outbox = {
patch_center.x + (0.5f*patch_size)*output_box[0],
patch_center.y + (0.5f*patch_size)*output_box[1],
@@ -453,7 +491,8 @@ Affine neuralnet_tracker::compute_pose(const PoseEstimator::Face &face) const
normalize(face.center, frame.rows, frame.cols),
intrinsics.focal_length_w);
- const mat33 m = rot_correction * quaternion_to_mat33(face.rotation);
+ const mat33 m = rot_correction * quaternion_to_mat33(
+ image_to_world(face.rotation));
/*
@@ -469,7 +508,10 @@ Affine neuralnet_tracker::compute_pose(const PoseEstimator::Face &face) const
*/
// Compute the location the network outputs in 3d space.
- const vec3 face_world_pos = image_to_world(face.center.x, face.center.y, face.size, head_size_mm);
+ const vec3 face_world_pos = image_to_world(
+ face.center.x, face.center.y, face.size, head_size_mm,
+ frame.size(),
+ intrinsics);
// But this is in general not the location of the rotation joint in the neck.
// So we need an extra offset. Which we determine by solving
@@ -520,7 +562,7 @@ void neuralnet_tracker::draw_gizmos(
draw_coord_line(2, {255, 0, 0});
// Draw the computed joint position
- auto xy = world_to_image(pose.t);
+ auto xy = world_to_image(pose.t, frame.size(), intrinsics);
cv::circle(frame, cv::Point(xy[0],xy[1]), 5, cv::Scalar(0,0,255), -1);
}
@@ -658,26 +700,6 @@ void neuralnet_tracker::set_intrinsics()
}
-vec3 neuralnet_tracker::image_to_world(float x, float y, float size, float real_size) const
-{
- // Compute the location the network outputs in 3d space.
- const float xpos = -(intrinsics.focal_length_w * frame.cols * 0.5f) / size * real_size;
- const float zpos = (x / frame.cols * 2.f - 1.f) * xpos / intrinsics.focal_length_w;
- const float ypos = (y / frame.rows * 2.f - 1.f) * xpos / intrinsics.focal_length_h;
- return {xpos, ypos, zpos};
-}
-
-
-vec2 neuralnet_tracker::world_to_image(const vec3& pos) const
-{
- const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w;
- const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h;
- const float x = (xscr+1.)*0.5f*frame.cols;
- const float y = (yscr+1.)*0.5f*frame.rows;
- return {x, y};
-}
-
-
void neuralnet_tracker::run()
{
if (!open_camera())
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.h b/tracker-neuralnet/ftnoir_tracker_neuralnet.h
index c74a7b1d..b6405d1a 100644
--- a/tracker-neuralnet/ftnoir_tracker_neuralnet.h
+++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.h
@@ -131,6 +131,7 @@ class PoseEstimator
Ort::Value{nullptr},
Ort::Value{nullptr}};
double last_inference_time = 0;
+ int64_t model_version = 0;
};
@@ -160,8 +161,6 @@ private:
void update_fps(double dt);
Affine compute_pose(const PoseEstimator::Face &face) const;
- numeric_types::vec3 image_to_world(float x, float y, float size, float real_size) const;
- numeric_types::vec2 world_to_image(const numeric_types::vec3& p) const;
settings s;
std::optional<Localizer> localizer;