summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorMichael Welter <michael@welter-4d.de>2022-09-11 20:55:26 +0200
committerStanislaw Halik <sthalik@misaki.pl>2022-11-01 13:51:35 +0100
commit08f1fcad1c74e25f97641a0ccbd229b267ec528c (patch)
tree000b1b276bc7df4a74fd493dab05bcce68801de8
parent77d6abaf53dbe2ee6334bd59b112e25d694a2f65 (diff)
tracker/nn: Tweaks, refactoring, a deadzone filtering and support for uncertainty estimation
* Add rudimentary test for two functions .. maybe more in future * Fix the rotation correction from vertical translation * Move preview class to new files * Move neural network model adapters to new files * Add utility functions for opencv * Query the model inputs/outputs by name to see what is available * Supports outputs for standard deviation of the data distribution - What you get if you let your model output the full parameters of a gaussian distribution (depending on the inputs) and fit it with negative log likelihood loss. * Disabled support for sequence models * Add support for detection of eye open/close classification. Scale uncertainty estimate up if eyes closed * Add a deadzone filter which activates if the model supports uncertainty quantification. The deadzone scales becomes larger the more uncertain the model/data are. This is mostly supposed to be useful to suppress large estimate errors when the user blinks with the eyes * Fix distance being twice of what it should have been
-rw-r--r--tracker-neuralnet/deadzone_filter.cpp162
-rw-r--r--tracker-neuralnet/deadzone_filter.h37
-rw-r--r--tracker-neuralnet/ftnoir_tracker_neuralnet.cpp822
-rw-r--r--tracker-neuralnet/ftnoir_tracker_neuralnet.h127
-rw-r--r--tracker-neuralnet/model_adapters.cpp433
-rw-r--r--tracker-neuralnet/model_adapters.h102
-rw-r--r--tracker-neuralnet/opencv_contrib.h120
-rw-r--r--tracker-neuralnet/preview.cpp135
-rw-r--r--tracker-neuralnet/preview.h60
-rw-r--r--tracker-neuralnet/tests.cpp58
-rw-r--r--tracker-neuralnet/unscented_trafo.h132
11 files changed, 1441 insertions, 747 deletions
diff --git a/tracker-neuralnet/deadzone_filter.cpp b/tracker-neuralnet/deadzone_filter.cpp
new file mode 100644
index 00000000..b41afdba
--- /dev/null
+++ b/tracker-neuralnet/deadzone_filter.cpp
@@ -0,0 +1,162 @@
+#include "deadzone_filter.h"
+#include "model_adapters.h"
+#include "opencv_contrib.h"
+#include "unscented_trafo.h"
+
+#include <opencv2/core/base.hpp>
+#include <opencv2/core/matx.hpp>
+#include <opencv2/core/quaternion.hpp>
+
+namespace neuralnet_tracker_ns
+{
+
+using namespace cvcontrib;
+
+using StateVec = cv::Vec<float,6>;
+using StateCov = cv::Matx<float,6,6>;
+
+static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints<6>::num_sigmas;
+static constexpr float img_scale = 200.f;
+static constexpr float world_scale = 1000.f; // mm
+
+StateCov make_tangent_space_uncertainty_tril(const PoseEstimator::Face &face)
+{
+ StateCov tril = StateCov::eye();
+ tril(0,0) = face.center_stddev.x/img_scale;
+ tril(1,1) = face.center_stddev.y/img_scale;
+ tril(2,2) = face.size_stddev/img_scale;
+ set_minor<3,3>(tril, 3, 3, face.rotaxis_cov_tril);
+ return tril;
+}
+
+
+QuatPose apply_offset(const QuatPose& pose, const StateVec& offset)
+{
+ // Unpack
+ const cv::Vec3f dp = { offset[0], offset[1], offset[2] };
+ const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] });
+ const auto p = pose.pos + dp;
+ const auto r = pose.rot * dr;
+ return { r, p };
+}
+
+
+PoseEstimator::Face apply_offset(const PoseEstimator::Face& face, const StateVec& offset)
+{
+ const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] });
+ const auto r = face.rotation * dr;
+
+ const cv::Point2f p = {
+ face.center.x + offset[0]*img_scale,
+ face.center.y + offset[1]*img_scale
+ };
+
+ // Intercept the case where the head size stddev became so large that the sigma points
+ // were created with negative head size (mean - constant*stddev ...). Negative head size
+ // is bad. But this is fine. The unscented transform where this function comes into play
+ // is designed to handle non-linearities like this.
+ const float sz = std::max(0.1f*face.size, face.size + offset[2]*img_scale);
+
+ return PoseEstimator::Face{
+ r,
+ {},
+ {},
+ p,
+ {},
+ sz,
+ {}
+ };
+}
+
+
+StateVec relative_to(const QuatPose& reference, const QuatPose& pose)
+{
+ const auto p = pose.pos - reference.pos;
+ const auto r = toRotVec(reference.rot.conjugate()*pose.rot);
+ return StateVec{ p[0], p[1], p[2], r[0], r[1], r[2] };
+}
+
+
+ukf_cv::SigmaPoints<6> relative_to(const QuatPose& pose, const std::array<QuatPose,num_sigmas>& sigmas)
+{
+ ukf_cv::SigmaPoints<6> out;
+ std::transform(sigmas.begin(), sigmas.end(), out.begin(), [&pose](const QuatPose& s) {
+ return relative_to(pose, s);
+ });
+ return out;
+}
+
+
+std::array<QuatPose,num_sigmas> compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints<6>& sigmas, Face2WorldFunction face2world)
+{
+ std::array<QuatPose,num_sigmas> out;
+ std::transform(sigmas.begin(), sigmas.end(), out.begin(), [face2world=std::move(face2world), &face](const StateVec& sigma_point) {
+ // First unpack the state vector and generate quaternion rotation w.r.t image space.
+ const auto sigma_face = apply_offset(face, sigma_point);
+ // Then transform ...
+ QuatPose pose = face2world(sigma_face.rotation, sigma_face.center, sigma_face.size);
+ pose.pos /= world_scale;
+ return pose;
+ });
+ return out;
+}
+
+
+StateVec apply_filter_to_offset(const StateVec& offset, const StateCov& offset_cov, float, const FiltParams& params)
+{
+ // Offset and Cov represent a multivariate normal distribution, which is the probability of the new pose measured w.r.t the previous one.
+ // Prob(x) ~exp(-(x-mu)t Cov^-1 (x-mu))
+ // We want to attenuate this offset, or zero it out completely, to obtain a deadzone-filter behaviour. The size of the deadzone shall be
+ // determined by the covariance projected to the offset direction like so:
+ // Take x = mu - mu / |mu| * alpha
+ // p(alpha) ~exp(-alpha^2 / |mu|^2 * mut Cov^-1 mu) = ~exp(-alpha^2 / sigma^2) with sigma^2 = mut Cov^-1 mu / |mu|^2.
+ // So this projection is like a 1d normal distribution with some standard deviation, which we take to scale the deadzone.
+
+ bool ok = true;
+
+ const float len_div_sigma_sqr = offset.dot(offset_cov.inv(cv::DECOMP_CHOLESKY, &ok) * offset);
+
+ const float attenuation = (ok) ? sigmoid((std::sqrt(len_div_sigma_sqr) - params.deadzone_size)*params.deadzone_hardness) : 1.f;
+
+ // {
+ // std::cout << "cov diag: " << offset_cov.diag() << std::endl;
+ // std::cout << "offset: " << cv::norm(offset) << std::endl;
+ // std::cout << "len_div_sigma_sqr: " << cv::norm(len_div_sigma_sqr) << std::endl;
+ // std::cout << "attenuation (" << ok << "): " << attenuation << std::endl;
+ // }
+
+ return offset*attenuation;
+}
+
+
+QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose_, float dt, Face2WorldFunction face2world, const FiltParams& params)
+{
+ ukf_cv::MerweScaledSigmaPoints<6> unscentedtrafo;
+ auto previous_pose = previous_pose_;
+ previous_pose.pos /= world_scale;
+
+ // Here we have the covariance matrix for the offset from the observed values in `face`.
+ const auto cov_tril = make_tangent_space_uncertainty_tril(face);
+
+ // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose.
+ const ukf_cv::SigmaPoints<6> sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true);
+
+ // We have many of these sigma points. This is why that callback comes into play here.
+ const std::array<QuatPose,num_sigmas> pose_sigmas = compute_world_pose_from_sigma_point(face, sigmas, std::move(face2world));
+
+ const ukf_cv::SigmaPoints<6> deltas_sigmas = relative_to(previous_pose, pose_sigmas);
+
+ const auto [offset, offset_cov] = unscentedtrafo.compute_statistics(deltas_sigmas);
+
+ // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive at the final output.
+ const StateVec scaled_offset = apply_filter_to_offset(offset, offset_cov, dt, params);
+
+ QuatPose new_pose = apply_offset(previous_pose, scaled_offset);
+
+ new_pose.pos *= world_scale;
+
+ return new_pose;
+}
+
+
+} // namespace neuralnet_tracker_ns \ No newline at end of file
diff --git a/tracker-neuralnet/deadzone_filter.h b/tracker-neuralnet/deadzone_filter.h
new file mode 100644
index 00000000..a9b6aada
--- /dev/null
+++ b/tracker-neuralnet/deadzone_filter.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "unscented_trafo.h"
+#include "opencv_contrib.h"
+#include "model_adapters.h"
+
+namespace neuralnet_tracker_ns
+{
+
+/// Represents a 6d pose by quaternion rotation and position vector.
+struct QuatPose {
+ cv::Quatf rot;
+ cv::Vec3f pos;
+};
+
+struct FiltParams
+{
+ float deadzone_hardness = 1.f;
+ float deadzone_size = 1.f;
+};
+
+/** Callback type for converting data from the `Face` struct to a 6d pose.
+*
+* This callback is needed because it depends on things that the filter doesn't have to know about and it is called multiple times
+* due to the way how uncertainty estimates are handled
+*/
+using Face2WorldFunction = std::function<QuatPose (const cv::Quatf&, const cv::Point2f&, float)>;
+
+/** Applies a deadzone filter similar to the one used in the Hamilton filter.
+*
+* What sets this apart is that the deadzone size scales with the uncertainty estimate of the network.
+* The rotation uncertainty is represented by a covariance matrix for the distribution of a rotation vector which
+* describes the offset from the mean rotation (the quaternion in the `Face` struct).
+*/
+QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose, float dt, Face2WorldFunction face2world, const FiltParams& params);
+
+} // namespace neuralnet_tracker_ns \ No newline at end of file
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
index 62209978..b56b4e96 100644
--- a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
+++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
@@ -6,17 +6,19 @@
*/
#include "ftnoir_tracker_neuralnet.h"
+#include "deadzone_filter.h"
+#include "opencv_contrib.h"
+
#include "compat/sleep.hpp"
#include "compat/math-imports.hpp"
-#include "cv/init.hpp"
-#include <opencv2/core.hpp>
-#include <opencv2/core/hal/interface.h>
-#include <opencv2/core/types.hpp>
-#include <opencv2/calib3d.hpp>
-#include <opencv2/imgcodecs.hpp>
#include "compat/timer.hpp"
#include "compat/check-visible.hpp"
+#include "cv/init.hpp"
+
#include <omp.h>
+#include <onnxruntime_cxx_api.h>
+#include <opencv2/core.hpp>
+#include <opencv2/core/quaternion.hpp>
#ifdef _MSC_VER
# pragma warning(disable : 4702)
@@ -32,7 +34,7 @@
#include <chrono>
#include <string>
#include <stdexcept>
-
+#include <unordered_map>
// Some demo code for onnx
// https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp
@@ -41,12 +43,11 @@
namespace neuralnet_tracker_ns
{
+using namespace cvcontrib;
using numeric_types::vec3;
using numeric_types::vec2;
using numeric_types::mat33;
-using quat = std::array<numeric_types::f,4>;
-
#if _MSC_VER
std::wstring convert(const QString &s) { return s.toStdWString(); }
@@ -55,6 +56,18 @@ std::string convert(const QString &s) { return s.toStdString(); }
#endif
+int enum_to_fps(int value)
+{
+ switch (value)
+ {
+ case fps_30: return 30;
+ case fps_60: return 60;
+ default: [[fallthrough]];
+ case fps_default: return 0;
+ }
+}
+
+
template<class F>
struct OnScopeExit
{
@@ -67,28 +80,38 @@ struct OnScopeExit
};
-float sigmoid(float x)
+CamIntrinsics make_intrinsics(const cv::Mat& img, const Settings& settings)
{
- return 1.f/(1.f + std::exp(-x));
-}
-
+ const int w = img.cols, h = img.rows;
+ const double diag_fov = settings.fov * M_PI / 180.;
+ const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w));
+ const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h));
+ const double focal_length_w = 1. / tan(.5 * fov_w);
+ const double focal_length_h = 1. / tan(.5 * fov_h);
+ /* a
+ ______ <--- here is sensor area
+ | /
+ | /
+ f | /
+ | / 2 x angle is the fov
+ |/
+ <--- here is the hole of the pinhole camera
+
+ So, a / f = tan(fov / 2)
+ => f = a/tan(fov/2)
+ What is a?
+ 1 if we define f in terms of clip space where the image plane goes from -1 to 1. Because a is the half-width.
+ */
-cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h)
-{
- auto [w, h] = size;
- if ( w*aspect_h > aspect_w*h )
- {
- // Image is too wide
- const int new_w = (aspect_w*h)/aspect_h;
- return cv::Rect((w - new_w)/2, 0, new_w, h);
- }
- else
- {
- const int new_h = (aspect_h*w)/aspect_w;
- return cv::Rect(0, (h - new_h)/2, w, new_h);
- }
+ return {
+ (float)focal_length_w,
+ (float)focal_length_h,
+ (float)fov_w,
+ (float)fov_h
+ };
}
+
cv::Rect make_crop_rect_multiple_of(const cv::Size &size, int multiple)
{
const int new_w = (size.width / multiple) * multiple;
@@ -111,20 +134,6 @@ cv::Rect_<T> squarize(const cv::Rect_<T> &r)
template<class T>
-cv::Point_<T> as_point(const cv::Size_<T>& s)
-{
- return { s.width, s.height };
-}
-
-
-template<class T>
-cv::Size_<T> as_size(const cv::Point_<T>& p)
-{
- return { p.x, p.y };
-}
-
-
-template<class T>
cv::Rect_<T> expand(const cv::Rect_<T>& r, T factor)
{
// xnew = l+.5*w - w*f*0.5 = l + .5*(w - new_w)
@@ -145,110 +154,40 @@ cv::Rect_<T> ewa_filter(const cv::Rect_<T>& last, const cv::Rect_<T>& current, T
}
-cv::Rect2f unnormalize(const cv::Rect2f &r, int h, int w)
-{
- auto unnorm = [](float x) -> float { return 0.5*(x+1); };
- auto tl = r.tl();
- auto br = r.br();
- auto x0 = unnorm(tl.x)*w;
- auto y0 = unnorm(tl.y)*h;
- auto x1 = unnorm(br.x)*w;
- auto y1 = unnorm(br.y)*h;
- return {
- x0, y0, x1-x0, y1-y0
- };
-}
-
-cv::Point2f normalize(const cv::Point2f &p, int h, int w)
-{
- return {
- p.x/w*2.f-1.f,
- p.y/h*2.f-1.f
- };
-}
-
-
-mat33 rotation_from_two_vectors(const vec3 &a, const vec3 &b)
-{
- vec3 axis = a.cross(b);
- const float len_a = cv::norm(a);
- const float len_b = cv::norm(b);
- const float len_axis = cv::norm(axis);
- const float sin_angle = std::clamp(len_axis / (len_a * len_b), -1.f, 1.f);
- const float angle = std::asin(sin_angle);
- axis *= angle/(1.e-12 + len_axis);
- mat33 out;
- cv::Rodrigues(axis, out);
- return out;
-}
-
-
-// Computes correction due to head being off screen center.
-// x, y: In screen space, i.e. in [-1,1]
-// focal_length_x: In screen space
-mat33 compute_rotation_correction(const cv::Point2f &p, float focal_length_x)
-{
- return rotation_from_two_vectors(
- {1.f,0.f,0.f},
- {focal_length_x, p.y, p.x});
-}
-
-
-mat33 quaternion_to_mat33(const std::array<float,4> quat)
-{
- mat33 m;
- const float w = quat[0];
- const float i = quat[1];
- const float j = quat[2];
- const float k = quat[3];
- m(0,0) = 1.f - 2.f*(j*j + k*k);
- m(1,0) = 2.f*(i*j + k*w);
- m(2,0) = 2.f*(i*k - j*w);
- m(0,1) = 2.f*(i*j - k*w);
- m(1,1) = 1.f - 2.f*(i*i + k*k);
- m(2,1) = 2.f*(j*k + i*w);
- m(0,2) = 2.f*(i*k + j*w);
- m(1,2) = 2.f*(j*k - i*w);
- m(2,2) = 1.f - 2.f*(i*i + j*j);
- return m;
-}
-
-
-vec3 rotate_vec(const quat& q, const vec3& p)
+cv::Vec3f image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
{
- const float qw = q[0];
- const float qi = q[1];
- const float qj = q[2];
- const float qk = q[3];
- const float pi = p[0];
- const float pj = p[1];
- const float pk = p[2];
- const quat tmp{
- - qi*pi - qj*pj - qk*pk,
- qw*pi + qj*pk - qk*pj,
- qw*pj - qi*pk + qk*pi,
- qw*pk + qi*pj - qj*pi
- };
- const vec3 out {
- -tmp[0]*qi + tmp[1]*qw - tmp[2]*qk + tmp[3]*qj,
- -tmp[0]*qj + tmp[1]*qk + tmp[2]*qw - tmp[3]*qi,
- -tmp[0]*qk - tmp[1]*qj + tmp[2]*qi + tmp[3]*qw
- };
- return out;
-}
-
-
-vec3 image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
-{
- // Compute the location the network outputs in 3d space.
- const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / size * reference_size_in_mm;
+ /*
+ Compute the location the network outputs in 3d space.
+
+ hhhhhh <- head size (meters)
+ \ | -----------------------
+ \ | \
+ \ | |
+ \ | |- x (meters)
+ ____ <- face.size / width |
+ \ | | |
+ \| |- focal length /
+ ------------------------
+ ------------------------------------------------>> z direction
+ z/x = zi / f
+ zi = image position
+ z = world position
+ f = focal length
+
+ We can also do deltas:
+ dz / x = dzi / f
+ => x = dz / dzi * f
+ which means we can compute x from the head size (dzi) if we assume some reference size (dz).
+ */
+ const float head_size_vertical = 2.f*size; // Size from the model is more like half the real vertical size of a human head.
+ const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / head_size_vertical * reference_size_in_mm;
const float zpos = (x / image_size.width * 2.f - 1.f) * xpos / intrinsics.focal_length_w;
const float ypos = (y / image_size.height * 2.f - 1.f) * xpos / intrinsics.focal_length_h;
return {xpos, ypos, zpos};
}
-vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
+vec2 world_to_image(const cv::Vec3f& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
{
const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w;
const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h;
@@ -258,7 +197,7 @@ vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntr
}
-quat image_to_world(quat q)
+cv::Quatf image_to_world(cv::Quatf q)
{
std::swap(q[1], q[3]);
q[1] = -q[1];
@@ -267,338 +206,73 @@ quat image_to_world(quat q)
return q;
}
-quat world_to_image(quat q)
-{
- // It's its own inverse.
- return image_to_world(q);
-}
-
-// Intersection over union. A value between 0 and 1 which measures the match between the bounding boxes.
-template<class T>
-T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b)
-{
- auto i = a & b;
- return double{i.area()} / (a.area()+b.area()-i.area());
-}
-
-// Returns width and height of the input tensor, or throws.
-// Expects the model to take one tensor as input that must
-// have the shape B x C x H x W, where B=C=1.
-cv::Size get_input_image_shape(const Ort::Session &session)
-{
- if (session.GetInputCount() < 1)
- throw std::invalid_argument("Model must take at least one input tensor");
- const std::vector<std::int64_t> shape =
- session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
- if (shape.size() != 4)
- throw std::invalid_argument("Model takes the input tensor in the wrong shape");
- return { static_cast<int>(shape[3]), static_cast<int>(shape[2]) };
-}
-
-Ort::Value create_tensor(const Ort::TypeInfo& info, Ort::Allocator& alloc)
+cv::Point2f normalize(const cv::Point2f &p, int h, int w)
{
- const auto shape = info.GetTensorTypeAndShapeInfo().GetShape();
- auto t = Ort::Value::CreateTensor<float>(
- alloc, shape.data(), shape.size());
- memset(t.GetTensorMutableData<float>(), 0, sizeof(float)*info.GetTensorTypeAndShapeInfo().GetElementCount());
- return t;
+ return {
+ p.x/w*2.f-1.f,
+ p.y/h*2.f-1.f
+ };
}
-
-int enum_to_fps(int value)
+cv::Quatf rotation_from_two_vectors(const vec3 &a, const vec3 &b)
{
- switch (value)
+ // |axis| = |a| * |b| * sin(alpha)
+ const vec3 axis = a.cross(b);
+ // dot = |a|*|b|*cos(alpha)
+ const float dot = a.dot(b);
+ const float len = cv::norm(axis);
+ vec3 normed_axis = axis / len;
+ float angle = std::atan2(len, dot);
+ if (!(std::isfinite(normed_axis[0]) && std::isfinite(normed_axis[1]) && std::isfinite(normed_axis[2])))
{
- case fps_30: return 30;
- case fps_60: return 60;
- default: [[fallthrough]];
- case fps_default: return 0;
+ angle = 0.f;
+ normed_axis = vec3{1.,0.,0.};
}
+ return cv::Quatf::createFromAngleAxis(angle, normed_axis);
}
-Localizer::Localizer(Ort::MemoryInfo &allocator_info, Ort::Session &&session) :
- session_{std::move(session)},
- scaled_frame_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_8U),
- input_mat_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_32F)
-{
- // Only works when input_mat does not reallocated memory ...which it should not.
- // Non-owning memory reference to input_mat?
- // Note: shape = (bach x channels x h x w)
- const std::int64_t input_shape[4] = { 1, 1, INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH };
- input_val_ = Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4);
-
- const std::int64_t output_shape[2] = { 1, 5 };
- output_val_ = Ort::Value::CreateTensor<float>(allocator_info, results_.data(), results_.size(), output_shape, 2);
-}
-
-
-std::pair<float, cv::Rect2f> Localizer::run(
- const cv::Mat &frame)
-{
- auto p = input_mat_.ptr(0);
-
- cv::resize(frame, scaled_frame_, { INPUT_IMG_WIDTH, INPUT_IMG_HEIGHT }, 0, 0, cv::INTER_AREA);
- scaled_frame_.convertTo(input_mat_, CV_32F, 1./255., -0.5);
-
- assert (input_mat_.ptr(0) == p);
- assert (!input_mat_.empty() && input_mat_.isContinuous());
- assert (input_mat_.cols == INPUT_IMG_WIDTH && input_mat_.rows == INPUT_IMG_HEIGHT);
-
- const char* input_names[] = {"x"};
- const char* output_names[] = {"logit_box"};
-
- Timer t; t.start();
-
- session_.Run(Ort::RunOptions{nullptr}, input_names, &input_val_, 1, output_names, &output_val_, 1);
-
- last_inference_time_ = t.elapsed_ms();
-
- const cv::Rect2f roi = unnormalize(cv::Rect2f{
- results_[1],
- results_[2],
- results_[3]-results_[1], // Width
- results_[4]-results_[2] // Height
- }, frame.rows, frame.cols);
- const float score = sigmoid(results_[0]);
-
- return { score, roi };
-}
-
-
-double Localizer::last_inference_time_millis() const
+// Computes correction due to head being off screen center.
+cv::Quatf compute_rotation_correction(const cv::Point3f& p)
{
- return last_inference_time_;
+ return rotation_from_two_vectors(
+ {-1.f,0.f,0.f}, p);
}
-PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session)
- : model_version_{_session.GetModelMetadata().GetVersion()}
- , session_{std::move(_session)}
- , allocator_{session_, allocator_info}
+// Intersection over union. A value between 0 and 1 which measures the match between the bounding boxes.
+template<class T>
+T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b)
{
- using namespace std::literals::string_literals;
-
- if (session_.GetOutputCount() < 2)
- throw std::runtime_error("Invalid Model: must have at least two outputs");
-
- // WARNING UB .. but still ...
- // If the model was saved without meta data, it seems the version field is uninitialized.
- // In that case reading from it is UB. However, we will just get same arbitrary number
- // which is hopefully different from the numbers used by models where the version is set.
- // I.e., this is what happended in practice so far.
- if (model_version_ != 2)
- model_version_ = 1;
-
- const cv::Size input_image_shape = get_input_image_shape(session_);
-
- scaled_frame_ = cv::Mat(input_image_shape, CV_8U, cv::Scalar(0));
- input_mat_ = cv::Mat(input_image_shape, CV_32F, cv::Scalar(0.f));
-
- {
- const std::int64_t input_shape[4] = { 1, 1, input_image_shape.height, input_image_shape.width };
- input_val_.push_back(
- Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4));
- }
-
- {
- const std::int64_t output_shape[2] = { 1, 3 };
- output_val_.push_back(Ort::Value::CreateTensor<float>(
- allocator_info, &output_coord_[0], output_coord_.rows, output_shape, 2));
- }
-
- {
- const std::int64_t output_shape[2] = { 1, 4 };
- output_val_.push_back(Ort::Value::CreateTensor<float>(
- allocator_info, &output_quat_[0], output_quat_.rows, output_shape, 2));
- }
-
- size_t num_regular_outputs = 2;
-
- if (session_.GetOutputCount() >= 3 && "box"s == session_.GetOutputName(2, allocator_))
- {
- const std::int64_t output_shape[2] = { 1, 4 };
- output_val_.push_back(Ort::Value::CreateTensor<float>(
- allocator_info, &output_box_[0], output_box_.rows, output_shape, 2));
- ++num_regular_outputs;
- qDebug() << "Note: Legacy model output for face ROI is currently ignored";
- }
-
- num_recurrent_states_ = session_.GetInputCount()-1;
- if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_)
- throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers.");
-
- // Create tensors for recurrent state
- for (size_t i = 0; i < num_recurrent_states_; ++i)
- {
- const auto& input_info = session_.GetInputTypeInfo(1+i);
- const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i);
- if (input_info.GetTensorTypeAndShapeInfo().GetShape() !=
- output_info.GetTensorTypeAndShapeInfo().GetShape())
- throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output");
- input_val_.push_back(create_tensor(input_info, allocator_));
- output_val_.push_back(create_tensor(output_info, allocator_));
- }
-
- for (size_t i = 0; i < session_.GetInputCount(); ++i)
- {
- input_names_.push_back(session_.GetInputName(i, allocator_));
- }
- for (size_t i = 0; i < session_.GetOutputCount(); ++i)
- {
- output_names_.push_back(session_.GetOutputName(i, allocator_));
- }
-
- qDebug() << "Model inputs: " << session_.GetInputCount() << ", outputs: " << session_.GetOutputCount() << ", recurrent states: " << num_recurrent_states_;
-
- assert (input_names_.size() == input_val_.size());
- assert (output_names_.size() == output_val_.size());
+ auto i = a & b;
+ return double{i.area()} / (a.area()+b.area()-i.area());
}
-int PoseEstimator::find_input_intensity_90_pct_quantile() const
+class GuardedThreadCountSwitch
{
- const int channels[] = { 0 };
- const int hist_size[] = { 255 };
- float range[] = { 0, 256 };
- const float* ranges[] = { range };
- cv::Mat hist;
- cv::calcHist(&scaled_frame_, 1, channels, cv::Mat(), hist, 1, hist_size, ranges, true, false);
- int gray_level = 0;
- const int num_pixels_quantile = scaled_frame_.total()*0.9f;
- int num_pixels_accum = 0;
- for (int i=0; i<hist_size[0]; ++i)
- {
- num_pixels_accum += hist.at<float>(i);
- if (num_pixels_accum > num_pixels_quantile)
+ int old_num_threads_cv_ = 1;
+ int old_num_threads_omp_ = 1;
+ public:
+ GuardedThreadCountSwitch(int num_threads)
{
- gray_level = i;
- break;
+ old_num_threads_cv_ = cv::getNumThreads();
+ old_num_threads_omp_ = omp_get_num_threads();
+ omp_set_num_threads(num_threads);
+ cv::setNumThreads(num_threads);
+ }
+
+ ~GuardedThreadCountSwitch()
+ {
+ omp_set_num_threads(old_num_threads_omp_);
+ cv::setNumThreads(old_num_threads_cv_);
}
- }
- return gray_level;
-}
-
-
-std::optional<PoseEstimator::Face> PoseEstimator::run(
- const cv::Mat &frame, const cv::Rect &box)
-{
- cv::Mat cropped;
-
- const int patch_size = std::max(box.width, box.height)*1.05;
- const cv::Point2f patch_center = {
- std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols),
- std::clamp<float>(box.y + 0.5f*box.height, 0.f, frame.rows)
- };
- cv::getRectSubPix(frame, {patch_size, patch_size}, patch_center, cropped);
-
- // Will get failure if patch_center is outside image boundariesettings.
- // Have to catch this case.
- if (cropped.rows != patch_size || cropped.cols != patch_size)
- return {};
-
- auto p = input_mat_.ptr(0);
-
- cv::resize(cropped, scaled_frame_, scaled_frame_.size(), 0, 0, cv::INTER_AREA);
-
- // Automatic brightness amplification.
- const int brightness = find_input_intensity_90_pct_quantile();
- const double alpha = brightness<127 ? 0.5/std::max(5,brightness) : 1./255;
- const double beta = -0.5;
-
- scaled_frame_.convertTo(input_mat_, CV_32F, alpha, beta);
-
- assert (input_mat_.ptr(0) == p);
- assert (!input_mat_.empty() && input_mat_.isContinuous());
-
-
- Timer t; t.start();
-
- try
- {
- session_.Run(
- Ort::RunOptions{ nullptr },
- input_names_.data(),
- input_val_.data(),
- input_val_.size(),
- output_names_.data(),
- output_val_.data(),
- output_val_.size());
- }
- catch (const Ort::Exception &e)
- {
- qDebug() << "Failed to run the model: " << e.what();
- return {};
- }
-
- for (size_t i = 0; i<num_recurrent_states_; ++i)
- {
- // Next step, the current output becomes the input.
- // Thus we realize the recurrent connection.
- // Only swaps the internal pointers. There is no copy of data.
- std::swap(
- output_val_[output_val_.size()-num_recurrent_states_+i],
- input_val_[input_val_.size()-num_recurrent_states_+i]);
- }
-
- // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why?
- // The instructions are always the same. Maybe a memory allocation
- // issue. The ONNX api suggests that tensor are allocated in an
- // arena. Does that matter? Maybe the issue is something else?
-
- last_inference_time_ = t.elapsed_ms();
-
- // Perform coordinate transformation.
- // From patch-local normalized in [-1,1] to
- // frame unnormalized pixel coordinatesettings.
-
- const cv::Point2f center = patch_center +
- (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]};
-
- const float size = patch_size*0.5f*output_coord_[2];
-
- // Following Eigen which uses quat components in the order w, x, y, z.
- quat rotation = {
- output_quat_[3],
- output_quat_[0],
- output_quat_[1],
- output_quat_[2] };
-
- if (model_version_ < 2)
- {
- // Due to a change in coordinate conventions
- rotation = world_to_image(rotation);
- }
-
- const cv::Rect2f outbox = {
- patch_center.x + (0.5f*patch_size)*output_box_[0],
- patch_center.y + (0.5f*patch_size)*output_box_[1],
- 0.5f*patch_size*(output_box_[2]-output_box_[0]),
- 0.5f*patch_size*(output_box_[3]-output_box_[1])
- };
-
- return std::optional<Face>({
- rotation, outbox, center, size
- });
-}
-
-
-cv::Mat PoseEstimator::last_network_input() const
-{
- assert(!input_mat_.empty());
- cv::Mat ret;
- input_mat_.convertTo(ret, CV_8U, 255., 127.);
- cv::cvtColor(ret, ret, cv::COLOR_GRAY2RGB);
- return ret;
-}
-
-double PoseEstimator::last_inference_time_millis() const
-{
- return last_inference_time_;
-}
+ GuardedThreadCountSwitch(const GuardedThreadCountSwitch&) = delete;
+ GuardedThreadCountSwitch& operator=(const GuardedThreadCountSwitch&) = delete;
+};
bool NeuralNetTracker::detect()
@@ -645,6 +319,8 @@ bool NeuralNetTracker::detect()
if (!last_roi_)
{
+ // Last iteration the tracker failed to generate a trustworthy
+ // roi and the localizer also cannot find a face.
draw_gizmos({}, {});
return false;
}
@@ -655,37 +331,28 @@ bool NeuralNetTracker::detect()
if (!face)
{
last_roi_.reset();
- draw_gizmos(*face, {});
+ draw_gizmos({}, {});
return false;
}
- {
- // Here: compute ROI from head size estimate. This helps make the size prediction more
- // invariant to mouth opening. The tracking can be lost more often at extreme
- // rotations, depending on the implementation details. The code down here has
- // been tweaked so that it works pretty well.
- // In old behaviour ROI is taken from the model outputs
- const vec3 offset = rotate_vec(face->rotation, vec3{0.f, 0.1f*face->size, face->size*0.3f});
- const float halfsize = face->size/float(settings_.roi_zoom);
- face->box = cv::Rect2f(
- face->center.x + offset[0] - halfsize,
- face->center.y + offset[1] - halfsize,
- halfsize*2.f,
- halfsize*2.f
- );
- }
-
- last_roi_ = ewa_filter(*last_roi_, face->box, float(settings_.roi_filter_alpha));
-
- Affine pose = compute_pose(*face);
+ cv::Rect2f roi = expand(face->box, (float)settings_.roi_zoom);
- draw_gizmos(*face, pose);
+ last_roi_ = ewa_filter(*last_roi_, roi, float(settings_.roi_filter_alpha));
+ QuatPose pose = compute_filtered_pose(*face);
+ last_pose_ = pose;
+
+ Affine pose_affine = {
+ pose.rot.toRotMat3x3(cv::QUAT_ASSUME_UNIT),
+ pose.pos };
+
{
QMutexLocker lck(&mtx_);
- this->pose_ = pose;
+ last_pose_affine_ = pose_affine;
}
+ draw_gizmos(*face, last_pose_affine_);
+
return true;
}
@@ -697,168 +364,74 @@ void NeuralNetTracker::draw_gizmos(
if (!is_visible_)
return;
- preview_.draw_gizmos(face, pose, last_roi_, last_localizer_roi_, world_to_image(pose.t, grayscale_.size(), intrinsics_));
+ preview_.draw_gizmos(
+ face,
+ last_roi_,
+ last_localizer_roi_,
+ world_to_image(pose.t, grayscale_.size(), intrinsics_));
if (settings_.show_network_input)
{
cv::Mat netinput = poseestimator_->last_network_input();
preview_.overlay_netinput(netinput);
}
-
- //preview_.draw_fps(fps, last_inference_time);
}
-Affine NeuralNetTracker::compute_pose(const PoseEstimator::Face &face) const
+QuatPose NeuralNetTracker::transform_to_world_pose(const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) const
{
- // Compute the location the network outputs in 3d space.
-
- const mat33 rot_correction = compute_rotation_correction(
- normalize(face.center, grayscale_.rows, grayscale_.cols),
- intrinsics_.focal_length_w);
-
- const mat33 m = rot_correction * quaternion_to_mat33(
- image_to_world(face.rotation));
-
- /*
-
- hhhhhh <- head size (meters)
- \ | -----------------------
- \ | \
- \ | |
- \ | |- tz (meters)
- ____ <- face.size / width |
- \ | | |
- \| |- focal length /
- ------------------------
- */
-
const vec3 face_world_pos = image_to_world(
- face.center.x, face.center.y, face.size, HEAD_SIZE_MM,
+ face_xy.x, face_xy.y, face_size, HEAD_SIZE_MM,
grayscale_.size(),
intrinsics_);
+ const cv::Quatf rot_correction = compute_rotation_correction(
+ face_world_pos);
+
+ cv::Quatf rot = rot_correction * image_to_world(face_rotation);
+
// But this is in general not the location of the rotation joint in the neck.
- // So we need an extra offset. Which we determine by solving
+ // So we need an extra offset. Which we determine by computing
// z,y,z-pos = head_joint_loc + R_face * offset
- const vec3 pos = face_world_pos
- + m * vec3{
- static_cast<float>(settings_.offset_fwd),
- static_cast<float>(settings_.offset_up),
- static_cast<float>(settings_.offset_right)};
+ const vec3 local_offset = vec3{
+ static_cast<float>(settings_.offset_fwd),
+ static_cast<float>(settings_.offset_up),
+ static_cast<float>(settings_.offset_right)};
+ const vec3 offset = rotate(rot, local_offset);
+ const vec3 pos = face_world_pos + offset;
- return { m, pos };
+ return { rot, pos };
}
-void Preview::init(const cv_video_widget& widget)
+QuatPose NeuralNetTracker::compute_filtered_pose(const PoseEstimator::Face &face)
{
- auto [w,h] = widget.preview_size();
- preview_size_ = { w, h };
-}
-
-
-void Preview::copy_video_frame(const cv::Mat& frame)
-{
- cv::Rect roi = make_crop_rect_for_aspect(frame.size(), preview_size_.width, preview_size_.height);
-
- cv::resize(frame(roi), preview_image_, preview_size_, 0, 0, cv::INTER_NEAREST);
-
- offset_ = { (float)-roi.x, (float)-roi.y };
- scale_ = float(preview_image_.cols) / float(roi.width);
-}
-
-
-void Preview::draw_gizmos(
- const std::optional<PoseEstimator::Face> &face,
- const Affine& pose,
- const std::optional<cv::Rect2f>& last_roi,
- const std::optional<cv::Rect2f>& last_localizer_roi,
- const cv::Point2f& neckjoint_position)
-{
- if (preview_image_.empty())
- return;
-
- if (last_roi)
+ if (fps_ > 0.01 && last_pose_ && poseestimator_->has_uncertainty())
{
- const int col = 255;
- cv::rectangle(preview_image_, transform(*last_roi), cv::Scalar(0, col, 0), /*thickness=*/1);
- }
- if (last_localizer_roi)
- {
- const int col = 255;
- cv::rectangle(preview_image_, transform(*last_localizer_roi), cv::Scalar(col, 0, 255-col), /*thickness=*/1);
+ auto image2world = [this](const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) {
+ return this->transform_to_world_pose(face_rotation, face_xy, face_size); };
+
+ return apply_filter(
+ face,
+ *last_pose_,
+ 1./fps_,
+ std::move(image2world),
+ FiltParams{
+ float(settings_.deadzone_hardness),
+ float(settings_.deadzone_size)
+ });
}
-
- if (face)
+ else
{
- if (face->size>=1.f)
- cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), int(transform(face->size)), cv::Scalar(255,255,255), 2);
- cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), 3, cv::Scalar(255,255,255), -1);
-
- auto draw_coord_line = [&](int i, const cv::Scalar& color)
- {
- const float vx = -pose.R(2,i);
- const float vy = -pose.R(1,i);
- static constexpr float len = 100.f;
- cv::Point q = face->center + len*cv::Point2f{vx, vy};
- cv::line(preview_image_, static_cast<cv::Point>(transform(face->center)), static_cast<cv::Point>(transform(q)), color, 2);
- };
- draw_coord_line(0, {0, 0, 255});
- draw_coord_line(1, {0, 255, 0});
- draw_coord_line(2, {255, 0, 0});
-
- // Draw the computed joint position
- auto xy = transform(neckjoint_position);
- cv::circle(preview_image_, cv::Point(xy.x,xy.y), 5, cv::Scalar(0,0,255), -1);
+ return transform_to_world_pose(face.rotation, face.center, face.size);
}
}
-void Preview::overlay_netinput(const cv::Mat& netinput)
-{
- if (netinput.empty())
- return;
-
- const int w = std::min(netinput.cols, preview_image_.cols);
- const int h = std::min(netinput.rows, preview_image_.rows);
- cv::Rect roi(0, 0, w, h);
- netinput(roi).copyTo(preview_image_(roi));
-}
-
-void Preview::draw_fps(double fps, double last_inference_time)
-{
- char buf[128];
- ::snprintf(buf, sizeof(buf), "%d Hz, pose inference: %d ms", std::clamp(int(fps), 0, 9999), int(last_inference_time));
- cv::putText(preview_image_, buf, cv::Point(10, preview_image_.rows-10), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(0, 255, 0), 1);
-}
-
-
-void Preview::copy_to_widget(cv_video_widget& widget)
-{
- if (preview_image_.rows > 0)
- widget.update_image(preview_image_);
-}
-
-
-cv::Rect2f Preview::transform(const cv::Rect2f& r) const
-{
- return { (r.x - offset_.x)*scale_, (r.y - offset_.y)*scale_, r.width*scale_, r.height*scale_ };
-}
-
-cv::Point2f Preview::transform(const cv::Point2f& p) const
-{
- return { (p.x - offset_.x)*scale_ , (p.y - offset_.y)*scale_ };
-}
-
-float Preview::transform(float s) const
-{
- return s * scale_;
-}
-
NeuralNetTracker::NeuralNetTracker()
{
opencv_init();
+ neuralnet_tracker_tests::run();
}
@@ -921,6 +494,7 @@ bool NeuralNetTracker::load_and_initialize_model()
<< e.what();
return false;
}
+
return true;
}
@@ -960,46 +534,6 @@ bool NeuralNetTracker::open_camera()
}
-void NeuralNetTracker::set_intrinsics()
-{
- const int w = grayscale_.cols, h = grayscale_.rows;
- const double diag_fov = settings_.fov * M_PI / 180.;
- const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w));
- const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h));
- const double focal_length_w = 1. / tan(.5 * fov_w);
- const double focal_length_h = 1. / tan(.5 * fov_h);
-
- intrinsics_.fov_h = fov_h;
- intrinsics_.fov_w = fov_w;
- intrinsics_.focal_length_w = focal_length_w;
- intrinsics_.focal_length_h = focal_length_h;
-}
-
-
-class GuardedThreadCountSwitch
-{
- int old_num_threads_cv_ = 1;
- int old_num_threads_omp_ = 1;
- public:
- GuardedThreadCountSwitch(int num_threads)
- {
- old_num_threads_cv_ = cv::getNumThreads();
- old_num_threads_omp_ = omp_get_num_threads();
- omp_set_num_threads(num_threads);
- cv::setNumThreads(num_threads);
- }
-
- ~GuardedThreadCountSwitch()
- {
- omp_set_num_threads(old_num_threads_omp_);
- cv::setNumThreads(old_num_threads_cv_);
- }
-
- GuardedThreadCountSwitch(const GuardedThreadCountSwitch&) = delete;
- GuardedThreadCountSwitch& operator=(const GuardedThreadCountSwitch&) = delete;
-};
-
-
void NeuralNetTracker::run()
{
preview_.init(*video_widget_);
@@ -1055,7 +589,7 @@ void NeuralNetTracker::run()
}
}
- set_intrinsics();
+ intrinsics_ = make_intrinsics(grayscale_, settings_);
detect();
@@ -1113,7 +647,7 @@ void NeuralNetTracker::data(double *data)
Affine tmp = [&]()
{
QMutexLocker lck(&mtx_);
- return pose_;
+ return last_pose_affine_;
}();
const auto& mx = tmp.R.col(0);
@@ -1140,7 +674,7 @@ void NeuralNetTracker::data(double *data)
Affine NeuralNetTracker::pose()
{
QMutexLocker lck(&mtx_);
- return pose_;
+ return last_pose_affine_;
}
std::tuple<cv::Size,double, double> NeuralNetTracker::stats() const
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.h b/tracker-neuralnet/ftnoir_tracker_neuralnet.h
index 9b481186..9e0374da 100644
--- a/tracker-neuralnet/ftnoir_tracker_neuralnet.h
+++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.h
@@ -7,6 +7,11 @@
#pragma once
+#include "ui_neuralnet-trackercontrols.h"
+#include "model_adapters.h"
+#include "deadzone_filter.h"
+#include "preview.h"
+
#include "options/options.hpp"
#include "api/plugin-api.hpp"
#include "cv/video-widget.hpp"
@@ -27,13 +32,9 @@
#include <cinttypes>
#include <array>
-#include <onnxruntime_cxx_api.h>
-
#include <opencv2/core.hpp>
-#include <opencv2/core/types.hpp>
#include <opencv2/imgproc.hpp>
-#include "ui_neuralnet-trackercontrols.h"
namespace neuralnet_tracker_ns
{
@@ -81,6 +82,8 @@ struct Settings : opts {
value<bool> use_mjpeg { b, "use-mjpeg", false };
value<int> num_threads { b, "num-threads", 1 };
value<int> resolution { b, "force-resolution", 0 };
+ value<double> deadzone_size { b, "deadzone-size", 1. };
+ value<double> deadzone_hardness { b, "deadzone-hardness", 1.5 };
Settings();
};
@@ -94,101 +97,6 @@ struct CamIntrinsics
};
-class Localizer
-{
- public:
- Localizer(Ort::MemoryInfo &allocator_info,
- Ort::Session &&session);
-
- // Returns bounding wrt image coordinate of the input image
- // The preceeding float is the score for being a face normalized to [0,1].
- std::pair<float, cv::Rect2f> run(
- const cv::Mat &frame);
-
- double last_inference_time_millis() const;
- private:
- inline static constexpr int INPUT_IMG_WIDTH = 288;
- inline static constexpr int INPUT_IMG_HEIGHT = 224;
- Ort::Session session_{nullptr};
- // Inputs / outputs
- cv::Mat scaled_frame_{}, input_mat_{};
- Ort::Value input_val_{nullptr}, output_val_{nullptr};
- std::array<float, 5> results_;
- double last_inference_time_ = 0;
-};
-
-
-class PoseEstimator
-{
- public:
- struct Face
- {
- std::array<float,4> rotation; // Quaternion, (w, x, y, z)
- cv::Rect2f box;
- cv::Point2f center;
- float size;
- };
-
- PoseEstimator(Ort::MemoryInfo &allocator_info,
- Ort::Session &&session);
- /** Inference
- *
- * Coordinates are defined wrt. the image space of the input `frame`.
- * X goes right, Z (depth) into the image, Y points down (like pixel coordinates values increase from top to bottom)
- */
- std::optional<Face> run(const cv::Mat &frame, const cv::Rect &box);
- // Returns an image compatible with the 'frame' image for displaying.
- cv::Mat last_network_input() const;
- double last_inference_time_millis() const;
- private:
- // Operates on the private image data members
- int find_input_intensity_90_pct_quantile() const;
-
- int64_t model_version_ = 0; // Queried meta data from the ONNX file
- Ort::Session session_{nullptr}; // ONNX's runtime context for running the model
- Ort::Allocator allocator_; // Memory allocator for tensors
- // Inputs
- cv::Mat scaled_frame_{}, input_mat_{}; // Input. One is the original crop, the other is rescaled (?)
- std::vector<Ort::Value> input_val_; // Tensors to put into the model
- std::vector<const char*> input_names_; // Refers to the names in the onnx model.
- // Outputs
- cv::Vec<float, 3> output_coord_{}; // 2d Coordinate and head size output.
- cv::Vec<float, 4> output_quat_{}; // Quaternion output
- cv::Vec<float, 4> output_box_{}; // Bounding box output
- std::vector<Ort::Value> output_val_; // Tensors to put the model outputs in.
- std::vector<const char*> output_names_; // Refers to the names in the onnx model.
- size_t num_recurrent_states_ = 0;
- double last_inference_time_ = 0;
-};
-
-
-class Preview
-{
-public:
- void init(const cv_video_widget& widget);
- void copy_video_frame(const cv::Mat& frame);
- void draw_gizmos(
- const std::optional<PoseEstimator::Face> &face,
- const Affine& pose,
- const std::optional<cv::Rect2f>& last_roi,
- const std::optional<cv::Rect2f>& last_localizer_roi,
- const cv::Point2f& neckjoint_position);
- void overlay_netinput(const cv::Mat& netinput);
- void draw_fps(double fps, double last_inference_time);
- void copy_to_widget(cv_video_widget& widget);
-private:
- // Transform from camera frame to preview
- cv::Rect2f transform(const cv::Rect2f& r) const;
- cv::Point2f transform(const cv::Point2f& p) const;
- float transform(float s) const;
-
- cv::Mat preview_image_;
- cv::Size preview_size_ = { 0, 0 };
- float scale_ = 1.f;
- cv::Point2f offset_ = { 0.f, 0.f};
-};
-
-
class NeuralNetTracker : protected virtual QThread, public ITracker
{
Q_OBJECT
@@ -214,7 +122,10 @@ private:
const std::optional<PoseEstimator::Face> &face,
const Affine& pose);
void update_fps(double dt);
- Affine compute_pose(const PoseEstimator::Face &face) const;
+ // Secretly applies filtering while computing the pose in 3d space.
+ QuatPose compute_filtered_pose(const PoseEstimator::Face &face);
+ // Compute the pose in 3d space taking the network outputs
+ QuatPose transform_to_world_pose(const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) const;
Settings settings_;
std::optional<Localizer> localizer_;
@@ -227,7 +138,7 @@ private:
std::array<cv::Mat,2> downsized_original_images_ = {}; // Image pyramid
std::optional<cv::Rect2f> last_localizer_roi_;
std::optional<cv::Rect2f> last_roi_;
- static constexpr float HEAD_SIZE_MM = 200.f;
+ static constexpr float HEAD_SIZE_MM = 200.f; // In the vertical. Approximately.
mutable QMutex stats_mtx_;
double fps_ = 0;
@@ -238,8 +149,9 @@ private:
int num_threads_ = 1;
bool is_visible_ = true;
- QMutex mtx_; // Protects the pose
- Affine pose_;
+ QMutex mtx_ = {}; // Protects the pose
+ std::optional<QuatPose> last_pose_ = {};
+ Affine last_pose_affine_ = {};
Preview preview_;
std::unique_ptr<cv_video_widget> video_widget_;
@@ -288,6 +200,15 @@ class NeuralNetMetadata : public Metadata
} // neuralnet_tracker_ns
+
+namespace neuralnet_tracker_tests
+{
+
+void run();
+
+}
+
+
using neuralnet_tracker_ns::NeuralNetTracker;
using neuralnet_tracker_ns::NeuralNetDialog;
using neuralnet_tracker_ns::NeuralNetMetadata;
diff --git a/tracker-neuralnet/model_adapters.cpp b/tracker-neuralnet/model_adapters.cpp
new file mode 100644
index 00000000..af599321
--- /dev/null
+++ b/tracker-neuralnet/model_adapters.cpp
@@ -0,0 +1,433 @@
+#include "model_adapters.h"
+
+#include "compat/timer.hpp"
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/quaternion.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <QDebug>
+
+
+namespace neuralnet_tracker_ns
+{
+
+
+float sigmoid(float x)
+{
+ return 1.f/(1.f + std::exp(-x));
+}
+
+
+// Defined in ftnoir_tracker_neuralnet.cpp
+// Normally we wouldn't need it here. However ... see below.
+cv::Quatf image_to_world(cv::Quatf q);
+
+
+cv::Quatf world_to_image(cv::Quatf q)
+{
+ // It's its own inverse.
+ return image_to_world(q);
+}
+
+
+cv::Rect2f unnormalize(const cv::Rect2f &r, int h, int w)
+{
+ auto unnorm = [](float x) -> float { return 0.5*(x+1); };
+ auto tl = r.tl();
+ auto br = r.br();
+ auto x0 = unnorm(tl.x)*w;
+ auto y0 = unnorm(tl.y)*h;
+ auto x1 = unnorm(br.x)*w;
+ auto y1 = unnorm(br.y)*h;
+ return {
+ x0, y0, x1-x0, y1-y0
+ };
+}
+
+
+// Returns width and height of the input tensor, or throws.
+// Expects the model to take one tensor as input that must
+// have the shape B x C x H x W, where B=C=1.
+cv::Size get_input_image_shape(const Ort::Session &session)
+{
+ if (session.GetInputCount() < 1)
+ throw std::invalid_argument("Model must take at least one input tensor");
+ const std::vector<std::int64_t> shape =
+ session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
+ if (shape.size() != 4)
+ throw std::invalid_argument("Model takes the input tensor in the wrong shape");
+ return { static_cast<int>(shape[3]), static_cast<int>(shape[2]) };
+}
+
+
+Ort::Value create_tensor(const Ort::TypeInfo& info, Ort::Allocator& alloc)
+{
+ const auto shape = info.GetTensorTypeAndShapeInfo().GetShape();
+ auto t = Ort::Value::CreateTensor<float>(
+ alloc, shape.data(), shape.size());
+ memset(t.GetTensorMutableData<float>(), 0, sizeof(float)*info.GetTensorTypeAndShapeInfo().GetElementCount());
+ return t;
+}
+
+
+int find_input_intensity_quantile(const cv::Mat& frame, float percentage)
+{
+ const int channels[] = { 0 };
+ const int hist_size[] = { 256 };
+ float range[] = { 0, 256 };
+ const float* ranges[] = { range };
+ cv::Mat hist;
+ cv::calcHist(&frame, 1, channels, cv::Mat(), hist, 1, hist_size, ranges, true, false);
+ int gray_level = 0;
+ const int num_pixels_quantile = frame.total()*percentage*0.01f;
+ int num_pixels_accum = 0;
+ for (int i=0; i<hist_size[0]; ++i)
+ {
+ num_pixels_accum += hist.at<float>(i);
+ if (num_pixels_accum > num_pixels_quantile)
+ {
+ gray_level = i;
+ break;
+ }
+ }
+ return gray_level;
+}
+
+
+// Automatic brightness adjustment. Scales brightness to lie between -.5 and 0.5, roughly.
+void normalize_brightness(const cv::Mat& frame, cv::Mat& out)
+{
+ const float pct = 90;
+
+ const int brightness = find_input_intensity_quantile(frame, pct);
+
+ const double alpha = brightness<127 ? (pct/100.f*0.5f/std::max(5,brightness)) : 1./255;
+ const double beta = -0.5;
+
+ frame.convertTo(out, CV_32F, alpha, beta);
+}
+
+
+
+Localizer::Localizer(Ort::MemoryInfo &allocator_info, Ort::Session &&session) :
+ session_{std::move(session)},
+ scaled_frame_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_8U),
+ input_mat_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_32F)
+{
+ // Only works when input_mat does not reallocated memory ...which it should not.
+ // Non-owning memory reference to input_mat?
+ // Note: shape = (bach x channels x h x w)
+ const std::int64_t input_shape[4] = { 1, 1, INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH };
+ input_val_ = Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4);
+
+ const std::int64_t output_shape[2] = { 1, 5 };
+ output_val_ = Ort::Value::CreateTensor<float>(allocator_info, results_.data(), results_.size(), output_shape, 2);
+}
+
+
+std::pair<float, cv::Rect2f> Localizer::run(
+ const cv::Mat &frame)
+{
+ auto p = input_mat_.ptr(0);
+
+ cv::resize(frame, scaled_frame_, { INPUT_IMG_WIDTH, INPUT_IMG_HEIGHT }, 0, 0, cv::INTER_AREA);
+ scaled_frame_.convertTo(input_mat_, CV_32F, 1./255., -0.5);
+
+ assert (input_mat_.ptr(0) == p);
+ assert (!input_mat_.empty() && input_mat_.isContinuous());
+ assert (input_mat_.cols == INPUT_IMG_WIDTH && input_mat_.rows == INPUT_IMG_HEIGHT);
+
+ const char* input_names[] = {"x"};
+ const char* output_names[] = {"logit_box"};
+
+ Timer t; t.start();
+
+ session_.Run(Ort::RunOptions{nullptr}, input_names, &input_val_, 1, output_names, &output_val_, 1);
+
+ last_inference_time_ = t.elapsed_ms();
+
+ const cv::Rect2f roi = unnormalize(cv::Rect2f{
+ results_[1],
+ results_[2],
+ results_[3]-results_[1], // Width
+ results_[4]-results_[2] // Height
+ }, frame.rows, frame.cols);
+ const float score = sigmoid(results_[0]);
+
+ return { score, roi };
+}
+
+
+double Localizer::last_inference_time_millis() const
+{
+ return last_inference_time_;
+}
+
+
+PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&session)
+ : model_version_{session.GetModelMetadata().GetVersion()}
+ , session_{std::move(session)}
+ , allocator_{session_, allocator_info}
+{
+ using namespace std::literals::string_literals;
+
+ if (session_.GetOutputCount() < 2)
+ throw std::runtime_error("Invalid Model: must have at least two outputs");
+
+ // WARNING UB .. but still ...
+ // If the model was saved without meta data, it seems the version field is uninitialized.
+ // In that case reading from it is UB. However, in practice we will just get some arbitrary number
+ // which is hopefully different from the numbers used by models where the version is set.
+ if (model_version_ != 2 && model_version_ != 3)
+ model_version_ = 1;
+
+ const cv::Size input_image_shape = get_input_image_shape(session_);
+
+ scaled_frame_ = cv::Mat(input_image_shape, CV_8U, cv::Scalar(0));
+ input_mat_ = cv::Mat(input_image_shape, CV_32F, cv::Scalar(0.f));
+
+ {
+ const std::int64_t input_shape[4] = { 1, 1, input_image_shape.height, input_image_shape.width };
+ input_val_.push_back(
+ Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4));
+ }
+
+ struct TensorSpec
+ {
+ std::vector<int64_t> shape;
+ float* buffer = nullptr;
+ size_t element_count = 0;
+ bool available = false;
+ };
+
+ std::unordered_map<std::string, TensorSpec> understood_outputs = {
+ { "pos_size", TensorSpec{ { 1, 3 }, &output_coord_[0], output_coord_.rows } },
+ { "quat", TensorSpec{ { 1, 4}, &output_quat_[0], output_quat_.rows } },
+ { "box", TensorSpec{ { 1, 4}, &output_box_[0], output_box_.rows } },
+ { "rotaxis_scales_tril", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }},
+ { "rotaxis_std", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, // TODO: Delete when old models aren't used any more
+ { "eyes", TensorSpec{ { 1, 2}, output_eyes_.val, output_eyes_.rows }},
+ { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}},
+ { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}},
+ //{ "box_std", TensorSpec{ {1, 4}, output_box_scales_.val, output_box_scales_ .rows}}
+ };
+
+ qDebug() << "Pose model inputs (" << session_.GetInputCount() << ")";
+ qDebug() << "Pose model outputs (" << session_.GetOutputCount() << "):";
+ for (size_t i=0; i<session_.GetOutputCount(); ++i)
+ {
+ const char* name = session_.GetOutputName(i, allocator_);
+ const auto& output_info = session_.GetOutputTypeInfo(i);
+ const auto& onnx_tensor_spec = output_info.GetTensorTypeAndShapeInfo();
+ auto my_tensor_spec = understood_outputs.find(name);
+
+ qDebug() << "\t" << name << " (" << onnx_tensor_spec.GetShape() << ") dtype: " << onnx_tensor_spec.GetElementType() << " " <<
+ (my_tensor_spec != understood_outputs.end() ? "ok" : "unknown");
+
+ if (my_tensor_spec != understood_outputs.end())
+ {
+ TensorSpec& t = my_tensor_spec->second;
+ if (onnx_tensor_spec.GetShape() != t.shape ||
+ onnx_tensor_spec.GetElementType() != Ort::TypeToTensorType<float>::type)
+ throw std::runtime_error("Invalid output tensor spec for "s + name);
+ output_val_.push_back(Ort::Value::CreateTensor<float>(
+ allocator_info, t.buffer, t.element_count, t.shape.data(), t.shape.size()));
+ t.available = true;
+ }
+ else
+ {
+ // Create tensor regardless and ignore output
+ output_val_.push_back(create_tensor(output_info, allocator_));
+ }
+ output_names_.push_back(name);
+ }
+
+ has_uncertainty_ = understood_outputs.at("rotaxis_scales_tril").available ||
+ understood_outputs.at("rotaxis_std").available;
+ has_uncertainty_ &= understood_outputs.at("pos_size_std").available ||
+ understood_outputs.at("pos_size_scales").available;
+ //has_uncertainty_ &= understood_outputs.at("box_std").available;
+ has_eye_closed_detection_ = understood_outputs.at("eyes").available;
+
+ // FIXME: Recurrent states
+
+ // size_t num_regular_outputs = 2;
+
+ // num_recurrent_states_ = session_.GetInputCount()-1;
+ // if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_)
+ // throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers.");
+
+ // // Create tensors for recurrent state
+ // for (size_t i = 0; i < num_recurrent_states_; ++i)
+ // {
+ // const auto& input_info = session_.GetInputTypeInfo(1+i);
+ // const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i);
+ // if (input_info.GetTensorTypeAndShapeInfo().GetShape() !=
+ // output_info.GetTensorTypeAndShapeInfo().GetShape())
+ // throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output");
+ // input_val_.push_back(create_tensor(input_info, allocator_));
+ // output_val_.push_back(create_tensor(output_info, allocator_));
+ // }
+
+ for (size_t i = 0; i < session_.GetInputCount(); ++i)
+ {
+ input_names_.push_back(session_.GetInputName(i, allocator_));
+ }
+
+ assert (input_names_.size() == input_val_.size());
+ assert (output_names_.size() == output_val_.size());
+}
+
+
+std::optional<PoseEstimator::Face> PoseEstimator::run(
+ const cv::Mat &frame, const cv::Rect &box)
+{
+ cv::Mat cropped;
+
+ const int patch_size = std::max(box.width, box.height)*1.05;
+ const cv::Point2f patch_center = {
+ std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols),
+ std::clamp<float>(box.y + 0.5f*box.height, 0.f, frame.rows)
+ };
+ cv::getRectSubPix(frame, {patch_size, patch_size}, patch_center, cropped);
+
+ // Will get failure if patch_center is outside image boundaries settings.
+ // Have to catch this case.
+ if (cropped.rows != patch_size || cropped.cols != patch_size)
+ return {};
+
+ [[maybe_unused]] auto* p = input_mat_.ptr(0);
+
+ cv::resize(cropped, scaled_frame_, scaled_frame_.size(), 0, 0, cv::INTER_AREA);
+
+ normalize_brightness(scaled_frame_, input_mat_);
+
+ assert (input_mat_.ptr(0) == p);
+ assert (!input_mat_.empty() && input_mat_.isContinuous());
+
+ Timer t; t.start();
+
+ try
+ {
+ session_.Run(
+ Ort::RunOptions{ nullptr },
+ input_names_.data(),
+ input_val_.data(),
+ input_val_.size(),
+ output_names_.data(),
+ output_val_.data(),
+ output_val_.size());
+ }
+ catch (const Ort::Exception &e)
+ {
+ qDebug() << "Failed to run the model: " << e.what();
+ return {};
+ }
+
+ for (size_t i = 0; i<num_recurrent_states_; ++i)
+ {
+ // Next step, the current output becomes the input.
+ // Thus we realize the recurrent connection.
+ // Only swaps the internal pointers. There is no copy of data.
+ std::swap(
+ output_val_[output_val_.size()-num_recurrent_states_+i],
+ input_val_[input_val_.size()-num_recurrent_states_+i]);
+ }
+
+ // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why?
+ // The instructions are always the same. Maybe a memory allocation
+ // issue. The ONNX api suggests that tensor are allocated in an
+ // arena. Does that matter? Maybe the issue is something else?
+
+ last_inference_time_ = t.elapsed_ms();
+
+ // Perform coordinate transformation.
+ // From patch-local normalized in [-1,1] to
+ // frame unnormalized pixel coordinatesettings.
+
+ const cv::Point2f center = patch_center +
+ (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]};
+
+ cv::Point2f center_stddev = {
+ (0.5f*patch_size)*output_coord_scales_[0],
+ (0.5f*patch_size)*output_coord_scales_[1] };
+
+ const float size = patch_size*0.5f*output_coord_[2];
+
+ float size_stddev = patch_size*0.5f*output_coord_scales_[2];
+
+ // Following Eigen which uses quat components in the order w, x, y, z.
+ // As does OpenCV
+ cv::Quatf rotation = {
+ output_quat_[3],
+ output_quat_[0],
+ output_quat_[1],
+ output_quat_[2] };
+
+ // Should be lower triangular. If not maybe something is wrong with memory layout ... or the model.
+ assert(output_rotaxis_scales_tril_(0, 1) == 0);
+ assert(output_rotaxis_scales_tril_(0, 2) == 0);
+ assert(output_rotaxis_scales_tril_(1, 2) == 0);
+
+ cv::Matx33f rotaxis_scales_tril = output_rotaxis_scales_tril_;
+
+ if (model_version_ < 2)
+ {
+ // Due to a change in coordinate conventions
+ rotation = world_to_image(rotation);
+ }
+
+ const cv::Rect2f outbox = {
+ patch_center.x + (0.5f*patch_size)*output_box_[0],
+ patch_center.y + (0.5f*patch_size)*output_box_[1],
+ 0.5f*patch_size*(output_box_[2]-output_box_[0]),
+ 0.5f*patch_size*(output_box_[3]-output_box_[1])
+ };
+ // const RoiCorners outbox = {
+ // patch_center + 0.5f*patch_size*cv::Point2f{output_box_[0], output_box_[1]},
+ // patch_center + 0.5f*patch_size*cv::Point2f{output_box_[2], output_box_[3]}
+ // };
+ // RoiCorners outbox_stddev = {
+ // 0.5f*patch_size*cv::Point2f{output_box_scales_[0], output_box_scales_[1]},
+ // 0.5f*patch_size*cv::Point2f{output_box_scales_[2], output_box_scales_[3]}
+ // };
+
+ // Because the model is sensitive to closing eyes we increase the uncertainty
+ // a lot to make the subsequent filtering smooth the output more. This should suppress
+ // "twitching" when the user blinks.
+ if (has_eye_closed_detection_)
+ {
+ const float eye_open = std::min(output_eyes_[0], output_eyes_[1]);
+ const float increase_factor = 1.f + 10.f * std::pow(1. - eye_open,4.f);
+ rotaxis_scales_tril *= increase_factor;
+ size_stddev *= increase_factor;
+ center_stddev *= increase_factor;
+ }
+
+ return std::optional<Face>({
+ rotation, rotaxis_scales_tril, outbox, center, center_stddev, size, size_stddev
+ });
+}
+
+
+cv::Mat PoseEstimator::last_network_input() const
+{
+ assert(!input_mat_.empty());
+ cv::Mat ret;
+ input_mat_.convertTo(ret, CV_8U, 255., 127.);
+ cv::cvtColor(ret, ret, cv::COLOR_GRAY2RGB);
+ return ret;
+}
+
+
+double PoseEstimator::last_inference_time_millis() const
+{
+ return last_inference_time_;
+}
+
+
+
+
+
+} // namespace neuralnet_tracker_ns \ No newline at end of file
diff --git a/tracker-neuralnet/model_adapters.h b/tracker-neuralnet/model_adapters.h
new file mode 100644
index 00000000..3fbfb861
--- /dev/null
+++ b/tracker-neuralnet/model_adapters.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <optional>
+#include <array>
+#include <vector>
+
+#include <onnxruntime_cxx_api.h>
+#include <opencv2/core.hpp>
+#include "opencv_contrib.h"
+
+
+namespace neuralnet_tracker_ns
+{
+
+// Generally useful sigmoid function
+float sigmoid(float x);
+
+
+class Localizer
+{
+ public:
+ Localizer(Ort::MemoryInfo &allocator_info,
+ Ort::Session &&session);
+
+ // Returns bounding wrt image coordinate of the input image
+ // The preceeding float is the score for being a face normalized to [0,1].
+ std::pair<float, cv::Rect2f> run(
+ const cv::Mat &frame);
+
+ double last_inference_time_millis() const;
+ private:
+ inline static constexpr int INPUT_IMG_WIDTH = 288;
+ inline static constexpr int INPUT_IMG_HEIGHT = 224;
+ Ort::Session session_{nullptr};
+ // Inputs / outputs
+ cv::Mat scaled_frame_{}, input_mat_{};
+ Ort::Value input_val_{nullptr}, output_val_{nullptr};
+ std::array<float, 5> results_;
+ double last_inference_time_ = 0;
+};
+
+
+class PoseEstimator
+{
+ public:
+ struct Face
+ {
+ cv::Quatf rotation;
+ cv::Matx33f rotaxis_cov_tril; // Lower triangular factor of Cholesky decomposition
+ cv::Rect2f box;
+ cv::Point2f center;
+ cv::Point2f center_stddev;
+ float size;
+ float size_stddev;
+ };
+
+ PoseEstimator(Ort::MemoryInfo &allocator_info,
+ Ort::Session &&session);
+ /** Inference
+ *
+ * Coordinates are defined wrt. the image space of the input `frame`.
+ * X goes right, Z (depth) into the image, Y points down (like pixel coordinates values increase from top to bottom)
+ */
+ std::optional<Face> run(const cv::Mat &frame, const cv::Rect &box);
+ // Returns an image compatible with the 'frame' image for displaying.
+ cv::Mat last_network_input() const;
+ double last_inference_time_millis() const;
+ bool has_uncertainty() const { return has_uncertainty_; }
+
+ private:
+ int64_t model_version_ = 0; // Queried meta data from the ONNX file
+ Ort::Session session_{nullptr}; // ONNX's runtime context for running the model
+ Ort::Allocator allocator_; // Memory allocator for tensors
+ // Inputs
+ cv::Mat scaled_frame_{}, input_mat_{}; // Input. One is the original crop, the other is rescaled (?)
+ std::vector<Ort::Value> input_val_; // Tensors to put into the model
+ std::vector<const char*> input_names_; // Refers to the names in the onnx model.
+ // Outputs
+ cv::Vec<float, 3> output_coord_{}; // 2d Coordinate and head size output.
+ cv::Vec<float, 4> output_quat_{}; // Quaternion output
+ cv::Vec<float, 4> output_box_{}; // Bounding box output
+ cv::Matx33f output_rotaxis_scales_tril_{}; // Lower triangular matrix of LLT factorization of covariance of rotation vector as offset from output quaternion
+ cv::Vec<float, 2> output_eyes_{};
+ cv::Vec<float, 3> output_coord_scales_{};
+ std::vector<Ort::Value> output_val_; // Tensors to put the model outputs in.
+ std::vector<const char*> output_names_; // Refers to the names in the onnx model.
+ // More bookkeeping
+ size_t num_recurrent_states_ = 0;
+ double last_inference_time_ = 0;
+ bool has_uncertainty_ = false;
+ bool has_eye_closed_detection_ = false;
+};
+
+
+// Finds the intensity where x percent of pixels have less intensity than that.
+int find_input_intensity_quantile(const cv::Mat& frame, float percentage);
+
+// Adjust brightness levels to full range and scales the value range to [-0.5, 0.5]
+void normalize_brightness(const cv::Mat& frame, cv::Mat& out);
+
+
+} // namespace neuralnet_tracker_ns \ No newline at end of file
diff --git a/tracker-neuralnet/opencv_contrib.h b/tracker-neuralnet/opencv_contrib.h
new file mode 100644
index 00000000..af92c12f
--- /dev/null
+++ b/tracker-neuralnet/opencv_contrib.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/base.hpp>
+#include <opencv2/core/quaternion.hpp>
+
+// Well eventually it might be a contribution
+
+namespace cvcontrib
+{
+
+
+template<class T>
+cv::Point_<T> as_point(const cv::Size_<T>& s)
+{
+ return { s.width, s.height };
+}
+
+
+template<class T>
+cv::Size_<T> as_size(const cv::Point_<T>& p)
+{
+ return { p.x, p.y };
+}
+
+
+template<int n, int m>
+inline bool allfinite(const cv::Matx<float, n, m> &mat)
+{
+ const size_t sz = mat.rows*mat.cols;
+ for (size_t i=0; i<sz; ++i)
+ if (!std::isfinite(mat.val[i]))
+ return false;
+ return true;
+}
+
+
+// Because compiler refuses to convert it automatically
+template<int n>
+inline cv::Vec<float, n> to_vec(const cv::Matx<float, n, 1>& m)
+{
+ return cv::Vec<float,n>{m.val};
+}
+
+
+template<int n, int m, int o>
+inline void set_minor(cv::Vec<float, m> &dst, const int startrow, const cv::Matx<float, o, 1> &src)
+{
+ assert (startrow>=0 && startrow+n <= dst.rows);
+ for (int row=startrow, i=0; row<startrow+n; ++row,++i)
+ {
+ dst[row] = src(i,0);
+ }
+}
+
+
+template<int nrows, int ncols, int m, int n>
+inline void set_minor(cv::Matx<float, m, n>& dst, const int startrow, int startcol, const cv::Matx<float, nrows, ncols> &src)
+{
+ assert (startrow>=0 && startrow+nrows <= dst.rows);
+ assert (startcol>=0 && startcol+ncols <= dst.cols);
+ for (int row=startrow, i=0; row<startrow+nrows; ++row,++i)
+ {
+ for (int col=startcol, j=0; col<startcol+ncols; ++col,++j)
+ {
+ dst(row, col) = src(i,j);
+ }
+ }
+}
+
+
+inline cv::Quatf identity_quat()
+{
+ return cv::Quatf(1,0,0,0);
+}
+
+
+inline cv::Vec3f toRotVec(const cv::Quatf& q)
+{
+ // This is an improved implementation
+#if 1
+ // w = cos(alpha/2)
+ // xyz = sin(alpha/2)*axis
+ static constexpr float eps = 1.e-12;
+ const cv::Vec3f xyz{q.x, q.y, q.z};
+ const float len = cv::norm(xyz);
+ const float angle = std::atan2(len, q.w)*2.f;
+ return xyz*(angle/(len+eps));
+#else
+ // The opencv implementation fails even the simplest test:
+ // out = toRVec(cv::Quatf{1., 0., 0., 0. });
+ // ASSERT_TRUE(std::isfinite(out[0]) && std::isfinite(out[1]) && std::isfinite(out[2]));
+ return q.toRotVec();
+#endif
+}
+
+
+inline cv::Vec3f rotate(const cv::Quatf& q, const cv::Vec3f &v)
+{
+ const auto r = q * cv::Quatf{0., v[0], v[1], v[2]} * q.conjugate();
+ return { r.x, r.y, r.z };
+}
+
+
+template<int n>
+inline cv::Matx<float, n, n> cholesky(const cv::Matx<float, n, n>& mat)
+{
+ cv::Matx<float, n, n> l = mat;
+ // Der Code ist die Doku!
+ // https://github.com/opencv/opencv/blob/4.5.4/modules/core/src/matrix_decomp.cpp#L95
+ cv::Cholesky(l.val, l.cols * sizeof(float), n, nullptr, 0, 0);
+ // It doesn't clear the upper triangle so we do it for it.
+ for (int row=0; row<n; ++row)
+ for (int col=row+1; col<n; ++col)
+ l(row, col) = 0.f;
+ return l;
+}
+
+
+} // namespace cvcontrib \ No newline at end of file
diff --git a/tracker-neuralnet/preview.cpp b/tracker-neuralnet/preview.cpp
new file mode 100644
index 00000000..76a6bbc0
--- /dev/null
+++ b/tracker-neuralnet/preview.cpp
@@ -0,0 +1,135 @@
+#include "preview.h"
+
+
+namespace neuralnet_tracker_ns
+{
+
+
+cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h)
+{
+ auto [w, h] = size;
+ if ( w*aspect_h > aspect_w*h )
+ {
+ // Image is too wide
+ const int new_w = (aspect_w*h)/aspect_h;
+ return cv::Rect((w - new_w)/2, 0, new_w, h);
+ }
+ else
+ {
+ const int new_h = (aspect_h*w)/aspect_w;
+ return cv::Rect(0, (h - new_h)/2, w, new_h);
+ }
+}
+
+
+
+
+void Preview::init(const cv_video_widget& widget)
+{
+ auto [w,h] = widget.preview_size();
+ preview_size_ = { w, h };
+}
+
+
+void Preview::copy_video_frame(const cv::Mat& frame)
+{
+ cv::Rect roi = make_crop_rect_for_aspect(frame.size(), preview_size_.width, preview_size_.height);
+
+ cv::resize(frame(roi), preview_image_, preview_size_, 0, 0, cv::INTER_NEAREST);
+
+ offset_ = { (float)-roi.x, (float)-roi.y };
+ scale_ = float(preview_image_.cols) / float(roi.width);
+}
+
+
+void Preview::draw_gizmos(
+ const std::optional<PoseEstimator::Face> &face,
+ const std::optional<cv::Rect2f>& last_roi,
+ const std::optional<cv::Rect2f>& last_localizer_roi,
+ const cv::Point2f& neckjoint_position)
+{
+ if (preview_image_.empty())
+ return;
+
+ if (last_roi)
+ {
+ const int col = 255;
+ cv::rectangle(preview_image_, transform(*last_roi), cv::Scalar(0, col, 0), /*thickness=*/1);
+ }
+ if (last_localizer_roi)
+ {
+ const int col = 255;
+ cv::rectangle(preview_image_, transform(*last_localizer_roi), cv::Scalar(col, 0, 255-col), /*thickness=*/1);
+ }
+
+ if (face)
+ {
+ if (face->size>=1.f)
+ cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), int(transform(face->size)), cv::Scalar(255,255,255), 2);
+ cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), 3, cv::Scalar(255,255,255), -1);
+
+ const cv::Matx33f R = face->rotation.toRotMat3x3(cv::QUAT_ASSUME_UNIT);
+
+ auto draw_coord_line = [&](int i, const cv::Scalar& color)
+ {
+ const float vx = R(0,i);
+ const float vy = R(1,i);
+ static constexpr float len = 100.f;
+ cv::Point q = face->center + len*cv::Point2f{vx, vy};
+ cv::line(preview_image_, static_cast<cv::Point>(transform(face->center)), static_cast<cv::Point>(transform(q)), color, 2);
+ };
+ draw_coord_line(0, {0, 0, 255});
+ draw_coord_line(1, {0, 255, 0});
+ draw_coord_line(2, {255, 0, 0});
+
+ // Draw the computed joint position
+ auto xy = transform(neckjoint_position);
+ cv::circle(preview_image_, cv::Point(xy.x,xy.y), 5, cv::Scalar(0,0,255), -1);
+ }
+
+
+}
+
+void Preview::overlay_netinput(const cv::Mat& netinput)
+{
+ if (netinput.empty())
+ return;
+
+ const int w = std::min(netinput.cols, preview_image_.cols);
+ const int h = std::min(netinput.rows, preview_image_.rows);
+ cv::Rect roi(0, 0, w, h);
+ netinput(roi).copyTo(preview_image_(roi));
+}
+
+void Preview::draw_fps(double fps, double last_inference_time)
+{
+ char buf[128];
+ ::snprintf(buf, sizeof(buf), "%d Hz, pose inference: %d ms", std::clamp(int(fps), 0, 9999), int(last_inference_time));
+ cv::putText(preview_image_, buf, cv::Point(10, preview_image_.rows-10), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(0, 255, 0), 1);
+}
+
+
+void Preview::copy_to_widget(cv_video_widget& widget)
+{
+ if (preview_image_.rows > 0)
+ widget.update_image(preview_image_);
+}
+
+
+cv::Rect2f Preview::transform(const cv::Rect2f& r) const
+{
+ return { (r.x - offset_.x)*scale_, (r.y - offset_.y)*scale_, r.width*scale_, r.height*scale_ };
+}
+
+cv::Point2f Preview::transform(const cv::Point2f& p) const
+{
+ return { (p.x - offset_.x)*scale_ , (p.y - offset_.y)*scale_ };
+}
+
+float Preview::transform(float s) const
+{
+ return s * scale_;
+}
+
+
+} \ No newline at end of file
diff --git a/tracker-neuralnet/preview.h b/tracker-neuralnet/preview.h
new file mode 100644
index 00000000..adc12993
--- /dev/null
+++ b/tracker-neuralnet/preview.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2021 Michael Welter <michael@welter-4d.de>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ */
+
+#pragma once
+
+#include "model_adapters.h"
+
+#include "cv/video-widget.hpp"
+
+#include <optional>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+
+namespace neuralnet_tracker_ns
+{
+
+/** Makes a maximum size cropping rect with the given aspect.
+* @param aspect_w: nominator of the aspect ratio
+* @param aspect_h: denom of the aspect ratio
+*/
+cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h);
+
+
+/** This class is responsible for drawing the debug/info gizmos
+*
+* In addition there function to transform the inputs to the size of
+* the preview image which can be different from the camera frame.
+*/
+class Preview
+{
+public:
+ void init(const cv_video_widget& widget);
+ void copy_video_frame(const cv::Mat& frame);
+ void draw_gizmos(
+ const std::optional<PoseEstimator::Face> &face,
+ const std::optional<cv::Rect2f>& last_roi,
+ const std::optional<cv::Rect2f>& last_localizer_roi,
+ const cv::Point2f& neckjoint_position);
+ void overlay_netinput(const cv::Mat& netinput);
+ void draw_fps(double fps, double last_inference_time);
+ void copy_to_widget(cv_video_widget& widget);
+private:
+ // Transform from camera image to preview
+ cv::Rect2f transform(const cv::Rect2f& r) const;
+ cv::Point2f transform(const cv::Point2f& p) const;
+ float transform(float s) const;
+
+ cv::Mat preview_image_;
+ cv::Size preview_size_ = { 0, 0 };
+ float scale_ = 1.f;
+ cv::Point2f offset_ = { 0.f, 0.f};
+};
+
+} // neuralnet_tracker_ns \ No newline at end of file
diff --git a/tracker-neuralnet/tests.cpp b/tracker-neuralnet/tests.cpp
new file mode 100644
index 00000000..b1d2a6d0
--- /dev/null
+++ b/tracker-neuralnet/tests.cpp
@@ -0,0 +1,58 @@
+#include "model_adapters.h"
+
+#include <algorithm>
+#include <numeric>
+#include <cstdio>
+
+namespace neuralnet_tracker_tests
+{
+
+
+void assert_(bool ok, const std::string& msg)
+{
+ if (ok)
+ return;
+ std::cout << msg << std::endl;
+ std::exit(-1);
+}
+
+
+void test_find_input_intensity_quantile()
+{
+ cv::Mat data(10,10, CV_8UC1);
+ std::iota(data.begin<uint8_t>(), data.end<uint8_t>(), 0);
+
+ const float pct = 90;
+
+ const int val = neuralnet_tracker_ns::find_input_intensity_quantile(data, pct);
+
+ assert_(val == int(10*10*pct/100.f), "test_find_input_intensity_quantile failed");
+}
+
+
+void test_normalize_brightness()
+{
+ cv::Mat data(10,10, CV_8UC1);
+ std::iota(data.begin<uint8_t>(), data.end<uint8_t>(), 0);
+
+ cv::Mat out;
+ neuralnet_tracker_ns::normalize_brightness(data, out);
+
+ auto [minit,maxit] = std::minmax_element(out.begin<float>(),out.end<float>());
+ const auto minval = *minit;
+ const auto maxval = *maxit;
+ assert_(std::abs(minval + 0.5f) < 0.02, "test_normalize_brightness failed");
+ // If the brightest value is lower than half-max, it will be boosted to half-max.
+ // Otherwise it will just be rescaled to [-.5, 0.5 ]. Here we have the low-brightness case.
+ assert_(std::abs(maxval - 0.0f) < 0.02, "test_normalize_brightness failed");
+}
+
+
+void run()
+{
+ test_find_input_intensity_quantile();
+ test_normalize_brightness();
+}
+
+
+} \ No newline at end of file
diff --git a/tracker-neuralnet/unscented_trafo.h b/tracker-neuralnet/unscented_trafo.h
new file mode 100644
index 00000000..267aa969
--- /dev/null
+++ b/tracker-neuralnet/unscented_trafo.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <algorithm>
+#include <opencv2/core.hpp>
+#include <opencv2/core/base.hpp>
+#include <opencv2/core/quaternion.hpp>
+
+#include <cmath>
+#include <vector>
+
+#include "opencv_contrib.h"
+
+namespace ukf_cv
+{
+
+using namespace cvcontrib;
+
+template<int dim, int otherdim = dim>
+using SigmaPoints = std::array<cv::Vec<float,otherdim>,dim*2+1>;
+
+
+// Ported from
+// https://filterpy.readthedocs.io/en/latest/_modules/filterpy/kalman/sigma_points.html
+// Excerpt from the original docu:
+// "
+
+// Generates sigma points and weights according to Van der Merwe's
+// 2004 dissertation[1] for the UnscentedKalmanFilter class.. It
+// parametizes the sigma points using alpha, beta, kappa terms, and
+// is the version seen in most publications.
+
+// Unless you know better, this should be your default choice.
+
+// alpha : float
+// Determins the spread of the sigma points around the mean.
+// Usually a small positive value (1e-3) according to [3].
+
+// beta : float
+// Incorporates prior knowledge of the distribution of the mean. For
+// Gaussian x beta=2 is optimal, according to [3].
+
+// kappa : float, default=0.0
+// Secondary scaling parameter usually set to 0 according to [4],
+// or to 3-n according to [5].
+
+// Reference
+// .. [1] R. Van der Merwe "Sigma-Point Kalman Filters for Probabilitic
+// Inference in Dynamic State-Space Models" (Doctoral dissertation)
+
+// "
+template<int dim>
+class MerweScaledSigmaPoints
+{
+public:
+ static constexpr int num_sigmas = 2*dim+1;
+
+ using Vector = cv::Vec<float,dim>;
+ using Matrix = cv::Matx<float,dim,dim>;
+
+ MerweScaledSigmaPoints(float alpha = 0.01, float beta = 2., int kappa = 3-dim)
+ {
+ lambda = alpha*alpha * (dim + kappa) - dim;
+ const float c = .5 / (dim + lambda);
+ Wc_i = c;
+ Wm_i = c;
+ Wm_0 = lambda / (dim+lambda);
+ Wc_0 = Wm_0 + (1.-alpha*alpha + beta);
+ }
+
+ SigmaPoints<dim> compute_sigmas(const Vector &mu, const Matrix &mat, bool is_tril_factor) const
+ {
+ const Matrix triu_factor = is_tril_factor ? mat.t() : cholesky(mat).t();
+
+ const Matrix U = triu_factor*std::sqrt(lambda+dim);
+
+ SigmaPoints<dim> sigmas;
+
+ sigmas[0] = mu;
+ for (int k=0; k<dim; ++k)
+ {
+ sigmas[k+1] = to_vec(mu + U.row(k).t());
+ sigmas[dim+k+1] = to_vec(mu - U.row(k).t());
+ }
+ return sigmas;
+ }
+
+ template<int otherdim>
+ std::tuple<cv::Vec<float,otherdim> , cv::Matx<float,otherdim,otherdim>> compute_statistics(const SigmaPoints<dim,otherdim> &sigmas) const
+ {
+ cv::Vec<float,otherdim> mu{}; // Zero initializes
+ for (size_t i=0; i<sigmas.size(); ++i)
+ {
+ mu += to_vec((i==0 ? Wm_0 : Wm_i) * sigmas[i]);
+ }
+
+ cv::Matx<float,otherdim,otherdim> cov{};
+ for (size_t i=0; i<sigmas.size(); ++i)
+ {
+ const auto p = sigmas[i] - mu;
+ cov += (i==0 ? Wc_0 : Wc_i)*p*p.t();
+ }
+
+ return { mu, cov };
+ }
+
+ template<int otherdim>
+ cv::Matx<float,dim,otherdim> compute_cov(const SigmaPoints<dim,dim> &sigmas, const SigmaPoints<dim,otherdim> &othersigmas) const
+ {
+ cv::Vec<float,dim> mu{}; // Zero initializes
+ cv::Vec<float,otherdim> mu_other{}; // Zero initializes
+ for (size_t i=0; i<sigmas.size(); ++i)
+ {
+ mu += to_vec((i==0 ? Wm_0 : Wm_i) * sigmas[i]);
+ mu_other += to_vec((i==0 ? Wm_0 : Wm_i) * othersigmas[i]);
+ }
+
+ cv::Matx<float,dim,otherdim> cov{};
+ for (size_t i=0; i<sigmas.size(); ++i)
+ {
+ const auto p = sigmas[i] - mu;
+ const auto q = othersigmas[i] - mu_other;
+ cov += (i==0 ? Wc_0 : Wc_i)*p*q.t();
+ }
+
+ return cov;
+ }
+private:
+ float Wc_i, Wm_i, Wm_0, Wc_0, lambda;
+};
+
+
+} // namespace ukf_cv \ No newline at end of file