author    | Michael Welter <michael@welter-4d.de> | 2022-09-11 20:55:26 +0200
committer | Stanislaw Halik <sthalik@misaki.pl> | 2022-11-01 13:51:35 +0100
commit    | 08f1fcad1c74e25f97641a0ccbd229b267ec528c (patch)
tree      | 000b1b276bc7df4a74fd493dab05bcce68801de8 /tracker-neuralnet
parent    | 77d6abaf53dbe2ee6334bd59b112e25d694a2f65 (diff)
tracker/nn: Tweaks, refactoring, deadzone filtering, and support for uncertainty estimation
* Add rudimentary tests for two functions; maybe more in the future
* Fix the rotation correction arising from vertical translation
* Move preview class to new files
* Move neural network model adapters to new files
* Add utility functions for opencv
* Query the model inputs/outputs by name to see what is available
* Support outputs for the standard deviation of the data distribution,
  i.e. what you get if you let the model output the full parameters of a
  Gaussian distribution (conditioned on the inputs) and fit it with a
  negative log-likelihood loss (a short sketch of this loss follows after
  this list)
* Disable support for sequence models
* Add support for eye open/closed classification.
  Scale the uncertainty estimate up if the eyes are closed
* Add a deadzone filter which activates if the model supports uncertainty
  quantification. The deadzone becomes larger the more uncertain the
  model/data are. This is mainly intended to suppress large estimation
  errors when the user blinks (a simplified sketch of the attenuation is
  given just before the deadzone_filter.cpp diff below)
* Fix the distance being twice what it should have been
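
For context on the standard-deviation outputs mentioned above: letting the network predict both the mean and the standard deviation of a Gaussian and training it with the negative log-likelihood amounts to minimizing a loss like the one below. This is a generic, hypothetical sketch of that training objective (the training code itself is not part of this commit), written in C++ purely for illustration:

```cpp
#include <cmath>

// Per-sample negative log-likelihood of target y under N(mu, sigma^2),
// with the constant 0.5*log(2*pi) term dropped. Minimizing this over the
// training data teaches the network to output a mean prediction together
// with a matching standard deviation (the "uncertainty").
float gaussian_nll(float y, float mu, float sigma)
{
    const float z = (y - mu) / sigma;
    return 0.5f * z * z + std::log(sigma);
}
```

Large errors can then be "explained away" by a large sigma, which is exactly the quantity the tracker later feeds into the deadzone filter.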
Diffstat (limited to 'tracker-neuralnet')
-rw-r--r-- | tracker-neuralnet/deadzone_filter.cpp | 162
-rw-r--r-- | tracker-neuralnet/deadzone_filter.h | 37
-rw-r--r-- | tracker-neuralnet/ftnoir_tracker_neuralnet.cpp | 822
-rw-r--r-- | tracker-neuralnet/ftnoir_tracker_neuralnet.h | 127
-rw-r--r-- | tracker-neuralnet/model_adapters.cpp | 433
-rw-r--r-- | tracker-neuralnet/model_adapters.h | 102
-rw-r--r-- | tracker-neuralnet/opencv_contrib.h | 120
-rw-r--r-- | tracker-neuralnet/preview.cpp | 135
-rw-r--r-- | tracker-neuralnet/preview.h | 60
-rw-r--r-- | tracker-neuralnet/tests.cpp | 58
-rw-r--r-- | tracker-neuralnet/unscented_trafo.h | 132
11 files changed, 1441 insertions, 747 deletions
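
The new deadzone_filter.cpp below measures the pose offset in units of its own uncertainty and attenuates it with a sigmoid. A minimal one-dimensional sketch of that idea follows; the names here are hypothetical and the logic is simplified from the actual 6-DOF `apply_filter_to_offset`, which computes a Mahalanobis-style distance with the full covariance matrix:

```cpp
#include <cmath>

float sigmoid(float x) { return 1.f / (1.f + std::exp(-x)); }

// `offset` is the measured change since the previous pose, `stddev` the
// model's uncertainty estimate for it. Offsets that are small compared to
// the uncertainty are suppressed; confident, large offsets pass through.
float attenuate_offset(float offset, float stddev,
                       float deadzone_size, float deadzone_hardness)
{
    // Offset expressed in standard deviations (1-D analogue of the
    // Mahalanobis distance used in the real filter).
    const float normalized = std::abs(offset) / stddev;
    // Smoothly ramps from ~0 inside the deadzone to ~1 well outside it.
    const float attenuation =
        sigmoid((normalized - deadzone_size) * deadzone_hardness);
    return offset * attenuation;
}
```

Because the uncertainty is scaled up when the eyes are detected as closed, the same offset is damped more strongly during blinks, which is the behaviour described in the commit message.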
diff --git a/tracker-neuralnet/deadzone_filter.cpp b/tracker-neuralnet/deadzone_filter.cpp new file mode 100644 index 00000000..b41afdba --- /dev/null +++ b/tracker-neuralnet/deadzone_filter.cpp @@ -0,0 +1,162 @@ +#include "deadzone_filter.h" +#include "model_adapters.h" +#include "opencv_contrib.h" +#include "unscented_trafo.h" + +#include <opencv2/core/base.hpp> +#include <opencv2/core/matx.hpp> +#include <opencv2/core/quaternion.hpp> + +namespace neuralnet_tracker_ns +{ + +using namespace cvcontrib; + +using StateVec = cv::Vec<float,6>; +using StateCov = cv::Matx<float,6,6>; + +static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints<6>::num_sigmas; +static constexpr float img_scale = 200.f; +static constexpr float world_scale = 1000.f; // mm + +StateCov make_tangent_space_uncertainty_tril(const PoseEstimator::Face &face) +{ + StateCov tril = StateCov::eye(); + tril(0,0) = face.center_stddev.x/img_scale; + tril(1,1) = face.center_stddev.y/img_scale; + tril(2,2) = face.size_stddev/img_scale; + set_minor<3,3>(tril, 3, 3, face.rotaxis_cov_tril); + return tril; +} + + +QuatPose apply_offset(const QuatPose& pose, const StateVec& offset) +{ + // Unpack + const cv::Vec3f dp = { offset[0], offset[1], offset[2] }; + const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] }); + const auto p = pose.pos + dp; + const auto r = pose.rot * dr; + return { r, p }; +} + + +PoseEstimator::Face apply_offset(const PoseEstimator::Face& face, const StateVec& offset) +{ + const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] }); + const auto r = face.rotation * dr; + + const cv::Point2f p = { + face.center.x + offset[0]*img_scale, + face.center.y + offset[1]*img_scale + }; + + // Intercept the case where the head size stddev became so large that the sigma points + // were created with negative head size (mean - constant*stddev ...). Negative head size + // is bad. But this is fine. The unscented transform where this function comes into play + // is designed to handle non-linearities like this. + const float sz = std::max(0.1f*face.size, face.size + offset[2]*img_scale); + + return PoseEstimator::Face{ + r, + {}, + {}, + p, + {}, + sz, + {} + }; +} + + +StateVec relative_to(const QuatPose& reference, const QuatPose& pose) +{ + const auto p = pose.pos - reference.pos; + const auto r = toRotVec(reference.rot.conjugate()*pose.rot); + return StateVec{ p[0], p[1], p[2], r[0], r[1], r[2] }; +} + + +ukf_cv::SigmaPoints<6> relative_to(const QuatPose& pose, const std::array<QuatPose,num_sigmas>& sigmas) +{ + ukf_cv::SigmaPoints<6> out; + std::transform(sigmas.begin(), sigmas.end(), out.begin(), [&pose](const QuatPose& s) { + return relative_to(pose, s); + }); + return out; +} + + +std::array<QuatPose,num_sigmas> compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints<6>& sigmas, Face2WorldFunction face2world) +{ + std::array<QuatPose,num_sigmas> out; + std::transform(sigmas.begin(), sigmas.end(), out.begin(), [face2world=std::move(face2world), &face](const StateVec& sigma_point) { + // First unpack the state vector and generate quaternion rotation w.r.t image space. + const auto sigma_face = apply_offset(face, sigma_point); + // Then transform ... 
+ QuatPose pose = face2world(sigma_face.rotation, sigma_face.center, sigma_face.size); + pose.pos /= world_scale; + return pose; + }); + return out; +} + + +StateVec apply_filter_to_offset(const StateVec& offset, const StateCov& offset_cov, float, const FiltParams& params) +{ + // Offset and Cov represent a multivariate normal distribution, which is the probability of the new pose measured w.r.t the previous one. + // Prob(x) ~exp(-(x-mu)t Cov^-1 (x-mu)) + // We want to attenuate this offset, or zero it out completely, to obtain a deadzone-filter behaviour. The size of the deadzone shall be + // determined by the covariance projected to the offset direction like so: + // Take x = mu - mu / |mu| * alpha + // p(alpha) ~exp(-alpha^2 / |mu|^2 * mut Cov^-1 mu) = ~exp(-alpha^2 / sigma^2) with sigma^2 = mut Cov^-1 mu / |mu|^2. + // So this projection is like a 1d normal distribution with some standard deviation, which we take to scale the deadzone. + + bool ok = true; + + const float len_div_sigma_sqr = offset.dot(offset_cov.inv(cv::DECOMP_CHOLESKY, &ok) * offset); + + const float attenuation = (ok) ? sigmoid((std::sqrt(len_div_sigma_sqr) - params.deadzone_size)*params.deadzone_hardness) : 1.f; + + // { + // std::cout << "cov diag: " << offset_cov.diag() << std::endl; + // std::cout << "offset: " << cv::norm(offset) << std::endl; + // std::cout << "len_div_sigma_sqr: " << cv::norm(len_div_sigma_sqr) << std::endl; + // std::cout << "attenuation (" << ok << "): " << attenuation << std::endl; + // } + + return offset*attenuation; +} + + +QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose_, float dt, Face2WorldFunction face2world, const FiltParams& params) +{ + ukf_cv::MerweScaledSigmaPoints<6> unscentedtrafo; + auto previous_pose = previous_pose_; + previous_pose.pos /= world_scale; + + // Here we have the covariance matrix for the offset from the observed values in `face`. + const auto cov_tril = make_tangent_space_uncertainty_tril(face); + + // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose. + const ukf_cv::SigmaPoints<6> sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true); + + // We have many of these sigma points. This is why that callback comes into play here. + const std::array<QuatPose,num_sigmas> pose_sigmas = compute_world_pose_from_sigma_point(face, sigmas, std::move(face2world)); + + const ukf_cv::SigmaPoints<6> deltas_sigmas = relative_to(previous_pose, pose_sigmas); + + const auto [offset, offset_cov] = unscentedtrafo.compute_statistics(deltas_sigmas); + + // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive at the final output. + const StateVec scaled_offset = apply_filter_to_offset(offset, offset_cov, dt, params); + + QuatPose new_pose = apply_offset(previous_pose, scaled_offset); + + new_pose.pos *= world_scale; + + return new_pose; +} + + +} // namespace neuralnet_tracker_ns
\ No newline at end of file diff --git a/tracker-neuralnet/deadzone_filter.h b/tracker-neuralnet/deadzone_filter.h new file mode 100644 index 00000000..a9b6aada --- /dev/null +++ b/tracker-neuralnet/deadzone_filter.h @@ -0,0 +1,37 @@ +#pragma once + +#include "unscented_trafo.h" +#include "opencv_contrib.h" +#include "model_adapters.h" + +namespace neuralnet_tracker_ns +{ + +/// Represents a 6d pose by quaternion rotation and position vector. +struct QuatPose { + cv::Quatf rot; + cv::Vec3f pos; +}; + +struct FiltParams +{ + float deadzone_hardness = 1.f; + float deadzone_size = 1.f; +}; + +/** Callback type for converting data from the `Face` struct to a 6d pose. +* +* This callback is needed because it depends on things that the filter doesn't have to know about and it is called multiple times +* due to the way how uncertainty estimates are handled +*/ +using Face2WorldFunction = std::function<QuatPose (const cv::Quatf&, const cv::Point2f&, float)>; + +/** Applies a deadzone filter similar to the one used in the Hamilton filter. +* +* What sets this apart is that the deadzone size scales with the uncertainty estimate of the network. +* The rotation uncertainty is represented by a covariance matrix for the distribution of a rotation vector which +* describes the offset from the mean rotation (the quaternion in the `Face` struct). +*/ +QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose, float dt, Face2WorldFunction face2world, const FiltParams& params); + +} // namespace neuralnet_tracker_ns
\ No newline at end of file diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp index 62209978..b56b4e96 100644 --- a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp +++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp @@ -6,17 +6,19 @@ */ #include "ftnoir_tracker_neuralnet.h" +#include "deadzone_filter.h" +#include "opencv_contrib.h" + #include "compat/sleep.hpp" #include "compat/math-imports.hpp" -#include "cv/init.hpp" -#include <opencv2/core.hpp> -#include <opencv2/core/hal/interface.h> -#include <opencv2/core/types.hpp> -#include <opencv2/calib3d.hpp> -#include <opencv2/imgcodecs.hpp> #include "compat/timer.hpp" #include "compat/check-visible.hpp" +#include "cv/init.hpp" + #include <omp.h> +#include <onnxruntime_cxx_api.h> +#include <opencv2/core.hpp> +#include <opencv2/core/quaternion.hpp> #ifdef _MSC_VER # pragma warning(disable : 4702) @@ -32,7 +34,7 @@ #include <chrono> #include <string> #include <stdexcept> - +#include <unordered_map> // Some demo code for onnx // https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp @@ -41,12 +43,11 @@ namespace neuralnet_tracker_ns { +using namespace cvcontrib; using numeric_types::vec3; using numeric_types::vec2; using numeric_types::mat33; -using quat = std::array<numeric_types::f,4>; - #if _MSC_VER std::wstring convert(const QString &s) { return s.toStdWString(); } @@ -55,6 +56,18 @@ std::string convert(const QString &s) { return s.toStdString(); } #endif +int enum_to_fps(int value) +{ + switch (value) + { + case fps_30: return 30; + case fps_60: return 60; + default: [[fallthrough]]; + case fps_default: return 0; + } +} + + template<class F> struct OnScopeExit { @@ -67,28 +80,38 @@ struct OnScopeExit }; -float sigmoid(float x) +CamIntrinsics make_intrinsics(const cv::Mat& img, const Settings& settings) { - return 1.f/(1.f + std::exp(-x)); -} - + const int w = img.cols, h = img.rows; + const double diag_fov = settings.fov * M_PI / 180.; + const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w)); + const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h)); + const double focal_length_w = 1. / tan(.5 * fov_w); + const double focal_length_h = 1. / tan(.5 * fov_h); + /* a + ______ <--- here is sensor area + | / + | / + f | / + | / 2 x angle is the fov + |/ + <--- here is the hole of the pinhole camera + + So, a / f = tan(fov / 2) + => f = a/tan(fov/2) + What is a? + 1 if we define f in terms of clip space where the image plane goes from -1 to 1. Because a is the half-width. 
+ */ -cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h) -{ - auto [w, h] = size; - if ( w*aspect_h > aspect_w*h ) - { - // Image is too wide - const int new_w = (aspect_w*h)/aspect_h; - return cv::Rect((w - new_w)/2, 0, new_w, h); - } - else - { - const int new_h = (aspect_h*w)/aspect_w; - return cv::Rect(0, (h - new_h)/2, w, new_h); - } + return { + (float)focal_length_w, + (float)focal_length_h, + (float)fov_w, + (float)fov_h + }; } + cv::Rect make_crop_rect_multiple_of(const cv::Size &size, int multiple) { const int new_w = (size.width / multiple) * multiple; @@ -111,20 +134,6 @@ cv::Rect_<T> squarize(const cv::Rect_<T> &r) template<class T> -cv::Point_<T> as_point(const cv::Size_<T>& s) -{ - return { s.width, s.height }; -} - - -template<class T> -cv::Size_<T> as_size(const cv::Point_<T>& p) -{ - return { p.x, p.y }; -} - - -template<class T> cv::Rect_<T> expand(const cv::Rect_<T>& r, T factor) { // xnew = l+.5*w - w*f*0.5 = l + .5*(w - new_w) @@ -145,110 +154,40 @@ cv::Rect_<T> ewa_filter(const cv::Rect_<T>& last, const cv::Rect_<T>& current, T } -cv::Rect2f unnormalize(const cv::Rect2f &r, int h, int w) -{ - auto unnorm = [](float x) -> float { return 0.5*(x+1); }; - auto tl = r.tl(); - auto br = r.br(); - auto x0 = unnorm(tl.x)*w; - auto y0 = unnorm(tl.y)*h; - auto x1 = unnorm(br.x)*w; - auto y1 = unnorm(br.y)*h; - return { - x0, y0, x1-x0, y1-y0 - }; -} - -cv::Point2f normalize(const cv::Point2f &p, int h, int w) -{ - return { - p.x/w*2.f-1.f, - p.y/h*2.f-1.f - }; -} - - -mat33 rotation_from_two_vectors(const vec3 &a, const vec3 &b) -{ - vec3 axis = a.cross(b); - const float len_a = cv::norm(a); - const float len_b = cv::norm(b); - const float len_axis = cv::norm(axis); - const float sin_angle = std::clamp(len_axis / (len_a * len_b), -1.f, 1.f); - const float angle = std::asin(sin_angle); - axis *= angle/(1.e-12 + len_axis); - mat33 out; - cv::Rodrigues(axis, out); - return out; -} - - -// Computes correction due to head being off screen center. -// x, y: In screen space, i.e. 
in [-1,1] -// focal_length_x: In screen space -mat33 compute_rotation_correction(const cv::Point2f &p, float focal_length_x) -{ - return rotation_from_two_vectors( - {1.f,0.f,0.f}, - {focal_length_x, p.y, p.x}); -} - - -mat33 quaternion_to_mat33(const std::array<float,4> quat) -{ - mat33 m; - const float w = quat[0]; - const float i = quat[1]; - const float j = quat[2]; - const float k = quat[3]; - m(0,0) = 1.f - 2.f*(j*j + k*k); - m(1,0) = 2.f*(i*j + k*w); - m(2,0) = 2.f*(i*k - j*w); - m(0,1) = 2.f*(i*j - k*w); - m(1,1) = 1.f - 2.f*(i*i + k*k); - m(2,1) = 2.f*(j*k + i*w); - m(0,2) = 2.f*(i*k + j*w); - m(1,2) = 2.f*(j*k - i*w); - m(2,2) = 1.f - 2.f*(i*i + j*j); - return m; -} - - -vec3 rotate_vec(const quat& q, const vec3& p) +cv::Vec3f image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) { - const float qw = q[0]; - const float qi = q[1]; - const float qj = q[2]; - const float qk = q[3]; - const float pi = p[0]; - const float pj = p[1]; - const float pk = p[2]; - const quat tmp{ - - qi*pi - qj*pj - qk*pk, - qw*pi + qj*pk - qk*pj, - qw*pj - qi*pk + qk*pi, - qw*pk + qi*pj - qj*pi - }; - const vec3 out { - -tmp[0]*qi + tmp[1]*qw - tmp[2]*qk + tmp[3]*qj, - -tmp[0]*qj + tmp[1]*qk + tmp[2]*qw - tmp[3]*qi, - -tmp[0]*qk - tmp[1]*qj + tmp[2]*qi + tmp[3]*qw - }; - return out; -} - - -vec3 image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) -{ - // Compute the location the network outputs in 3d space. - const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / size * reference_size_in_mm; + /* + Compute the location the network outputs in 3d space. + + hhhhhh <- head size (meters) + \ | ----------------------- + \ | \ + \ | | + \ | |- x (meters) + ____ <- face.size / width | + \ | | | + \| |- focal length / + ------------------------ + ------------------------------------------------>> z direction + z/x = zi / f + zi = image position + z = world position + f = focal length + + We can also do deltas: + dz / x = dzi / f + => x = dz / dzi * f + which means we can compute x from the head size (dzi) if we assume some reference size (dz). + */ + const float head_size_vertical = 2.f*size; // Size from the model is more like half the real vertical size of a human head. + const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / head_size_vertical * reference_size_in_mm; const float zpos = (x / image_size.width * 2.f - 1.f) * xpos / intrinsics.focal_length_w; const float ypos = (y / image_size.height * 2.f - 1.f) * xpos / intrinsics.focal_length_h; return {xpos, ypos, zpos}; } -vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) +vec2 world_to_image(const cv::Vec3f& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) { const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w; const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h; @@ -258,7 +197,7 @@ vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntr } -quat image_to_world(quat q) +cv::Quatf image_to_world(cv::Quatf q) { std::swap(q[1], q[3]); q[1] = -q[1]; @@ -267,338 +206,73 @@ quat image_to_world(quat q) return q; } -quat world_to_image(quat q) -{ - // It's its own inverse. - return image_to_world(q); -} - -// Intersection over union. A value between 0 and 1 which measures the match between the bounding boxes. 
-template<class T> -T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b) -{ - auto i = a & b; - return double{i.area()} / (a.area()+b.area()-i.area()); -} - -// Returns width and height of the input tensor, or throws. -// Expects the model to take one tensor as input that must -// have the shape B x C x H x W, where B=C=1. -cv::Size get_input_image_shape(const Ort::Session &session) -{ - if (session.GetInputCount() < 1) - throw std::invalid_argument("Model must take at least one input tensor"); - const std::vector<std::int64_t> shape = - session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); - if (shape.size() != 4) - throw std::invalid_argument("Model takes the input tensor in the wrong shape"); - return { static_cast<int>(shape[3]), static_cast<int>(shape[2]) }; -} - -Ort::Value create_tensor(const Ort::TypeInfo& info, Ort::Allocator& alloc) +cv::Point2f normalize(const cv::Point2f &p, int h, int w) { - const auto shape = info.GetTensorTypeAndShapeInfo().GetShape(); - auto t = Ort::Value::CreateTensor<float>( - alloc, shape.data(), shape.size()); - memset(t.GetTensorMutableData<float>(), 0, sizeof(float)*info.GetTensorTypeAndShapeInfo().GetElementCount()); - return t; + return { + p.x/w*2.f-1.f, + p.y/h*2.f-1.f + }; } - -int enum_to_fps(int value) +cv::Quatf rotation_from_two_vectors(const vec3 &a, const vec3 &b) { - switch (value) + // |axis| = |a| * |b| * sin(alpha) + const vec3 axis = a.cross(b); + // dot = |a|*|b|*cos(alpha) + const float dot = a.dot(b); + const float len = cv::norm(axis); + vec3 normed_axis = axis / len; + float angle = std::atan2(len, dot); + if (!(std::isfinite(normed_axis[0]) && std::isfinite(normed_axis[1]) && std::isfinite(normed_axis[2]))) { - case fps_30: return 30; - case fps_60: return 60; - default: [[fallthrough]]; - case fps_default: return 0; + angle = 0.f; + normed_axis = vec3{1.,0.,0.}; } + return cv::Quatf::createFromAngleAxis(angle, normed_axis); } -Localizer::Localizer(Ort::MemoryInfo &allocator_info, Ort::Session &&session) : - session_{std::move(session)}, - scaled_frame_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_8U), - input_mat_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_32F) -{ - // Only works when input_mat does not reallocated memory ...which it should not. - // Non-owning memory reference to input_mat? 
- // Note: shape = (bach x channels x h x w) - const std::int64_t input_shape[4] = { 1, 1, INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH }; - input_val_ = Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4); - - const std::int64_t output_shape[2] = { 1, 5 }; - output_val_ = Ort::Value::CreateTensor<float>(allocator_info, results_.data(), results_.size(), output_shape, 2); -} - - -std::pair<float, cv::Rect2f> Localizer::run( - const cv::Mat &frame) -{ - auto p = input_mat_.ptr(0); - - cv::resize(frame, scaled_frame_, { INPUT_IMG_WIDTH, INPUT_IMG_HEIGHT }, 0, 0, cv::INTER_AREA); - scaled_frame_.convertTo(input_mat_, CV_32F, 1./255., -0.5); - - assert (input_mat_.ptr(0) == p); - assert (!input_mat_.empty() && input_mat_.isContinuous()); - assert (input_mat_.cols == INPUT_IMG_WIDTH && input_mat_.rows == INPUT_IMG_HEIGHT); - - const char* input_names[] = {"x"}; - const char* output_names[] = {"logit_box"}; - - Timer t; t.start(); - - session_.Run(Ort::RunOptions{nullptr}, input_names, &input_val_, 1, output_names, &output_val_, 1); - - last_inference_time_ = t.elapsed_ms(); - - const cv::Rect2f roi = unnormalize(cv::Rect2f{ - results_[1], - results_[2], - results_[3]-results_[1], // Width - results_[4]-results_[2] // Height - }, frame.rows, frame.cols); - const float score = sigmoid(results_[0]); - - return { score, roi }; -} - - -double Localizer::last_inference_time_millis() const +// Computes correction due to head being off screen center. +cv::Quatf compute_rotation_correction(const cv::Point3f& p) { - return last_inference_time_; + return rotation_from_two_vectors( + {-1.f,0.f,0.f}, p); } -PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session) - : model_version_{_session.GetModelMetadata().GetVersion()} - , session_{std::move(_session)} - , allocator_{session_, allocator_info} +// Intersection over union. A value between 0 and 1 which measures the match between the bounding boxes. +template<class T> +T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b) { - using namespace std::literals::string_literals; - - if (session_.GetOutputCount() < 2) - throw std::runtime_error("Invalid Model: must have at least two outputs"); - - // WARNING UB .. but still ... - // If the model was saved without meta data, it seems the version field is uninitialized. - // In that case reading from it is UB. However, we will just get same arbitrary number - // which is hopefully different from the numbers used by models where the version is set. - // I.e., this is what happended in practice so far. 
- if (model_version_ != 2) - model_version_ = 1; - - const cv::Size input_image_shape = get_input_image_shape(session_); - - scaled_frame_ = cv::Mat(input_image_shape, CV_8U, cv::Scalar(0)); - input_mat_ = cv::Mat(input_image_shape, CV_32F, cv::Scalar(0.f)); - - { - const std::int64_t input_shape[4] = { 1, 1, input_image_shape.height, input_image_shape.width }; - input_val_.push_back( - Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4)); - } - - { - const std::int64_t output_shape[2] = { 1, 3 }; - output_val_.push_back(Ort::Value::CreateTensor<float>( - allocator_info, &output_coord_[0], output_coord_.rows, output_shape, 2)); - } - - { - const std::int64_t output_shape[2] = { 1, 4 }; - output_val_.push_back(Ort::Value::CreateTensor<float>( - allocator_info, &output_quat_[0], output_quat_.rows, output_shape, 2)); - } - - size_t num_regular_outputs = 2; - - if (session_.GetOutputCount() >= 3 && "box"s == session_.GetOutputName(2, allocator_)) - { - const std::int64_t output_shape[2] = { 1, 4 }; - output_val_.push_back(Ort::Value::CreateTensor<float>( - allocator_info, &output_box_[0], output_box_.rows, output_shape, 2)); - ++num_regular_outputs; - qDebug() << "Note: Legacy model output for face ROI is currently ignored"; - } - - num_recurrent_states_ = session_.GetInputCount()-1; - if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_) - throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers."); - - // Create tensors for recurrent state - for (size_t i = 0; i < num_recurrent_states_; ++i) - { - const auto& input_info = session_.GetInputTypeInfo(1+i); - const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i); - if (input_info.GetTensorTypeAndShapeInfo().GetShape() != - output_info.GetTensorTypeAndShapeInfo().GetShape()) - throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output"); - input_val_.push_back(create_tensor(input_info, allocator_)); - output_val_.push_back(create_tensor(output_info, allocator_)); - } - - for (size_t i = 0; i < session_.GetInputCount(); ++i) - { - input_names_.push_back(session_.GetInputName(i, allocator_)); - } - for (size_t i = 0; i < session_.GetOutputCount(); ++i) - { - output_names_.push_back(session_.GetOutputName(i, allocator_)); - } - - qDebug() << "Model inputs: " << session_.GetInputCount() << ", outputs: " << session_.GetOutputCount() << ", recurrent states: " << num_recurrent_states_; - - assert (input_names_.size() == input_val_.size()); - assert (output_names_.size() == output_val_.size()); + auto i = a & b; + return double{i.area()} / (a.area()+b.area()-i.area()); } -int PoseEstimator::find_input_intensity_90_pct_quantile() const +class GuardedThreadCountSwitch { - const int channels[] = { 0 }; - const int hist_size[] = { 255 }; - float range[] = { 0, 256 }; - const float* ranges[] = { range }; - cv::Mat hist; - cv::calcHist(&scaled_frame_, 1, channels, cv::Mat(), hist, 1, hist_size, ranges, true, false); - int gray_level = 0; - const int num_pixels_quantile = scaled_frame_.total()*0.9f; - int num_pixels_accum = 0; - for (int i=0; i<hist_size[0]; ++i) - { - num_pixels_accum += hist.at<float>(i); - if (num_pixels_accum > num_pixels_quantile) + int old_num_threads_cv_ = 1; + int old_num_threads_omp_ = 1; + public: + GuardedThreadCountSwitch(int num_threads) { - 
gray_level = i; - break; + old_num_threads_cv_ = cv::getNumThreads(); + old_num_threads_omp_ = omp_get_num_threads(); + omp_set_num_threads(num_threads); + cv::setNumThreads(num_threads); + } + + ~GuardedThreadCountSwitch() + { + omp_set_num_threads(old_num_threads_omp_); + cv::setNumThreads(old_num_threads_cv_); } - } - return gray_level; -} - - -std::optional<PoseEstimator::Face> PoseEstimator::run( - const cv::Mat &frame, const cv::Rect &box) -{ - cv::Mat cropped; - - const int patch_size = std::max(box.width, box.height)*1.05; - const cv::Point2f patch_center = { - std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols), - std::clamp<float>(box.y + 0.5f*box.height, 0.f, frame.rows) - }; - cv::getRectSubPix(frame, {patch_size, patch_size}, patch_center, cropped); - - // Will get failure if patch_center is outside image boundariesettings. - // Have to catch this case. - if (cropped.rows != patch_size || cropped.cols != patch_size) - return {}; - - auto p = input_mat_.ptr(0); - - cv::resize(cropped, scaled_frame_, scaled_frame_.size(), 0, 0, cv::INTER_AREA); - - // Automatic brightness amplification. - const int brightness = find_input_intensity_90_pct_quantile(); - const double alpha = brightness<127 ? 0.5/std::max(5,brightness) : 1./255; - const double beta = -0.5; - - scaled_frame_.convertTo(input_mat_, CV_32F, alpha, beta); - - assert (input_mat_.ptr(0) == p); - assert (!input_mat_.empty() && input_mat_.isContinuous()); - - - Timer t; t.start(); - - try - { - session_.Run( - Ort::RunOptions{ nullptr }, - input_names_.data(), - input_val_.data(), - input_val_.size(), - output_names_.data(), - output_val_.data(), - output_val_.size()); - } - catch (const Ort::Exception &e) - { - qDebug() << "Failed to run the model: " << e.what(); - return {}; - } - - for (size_t i = 0; i<num_recurrent_states_; ++i) - { - // Next step, the current output becomes the input. - // Thus we realize the recurrent connection. - // Only swaps the internal pointers. There is no copy of data. - std::swap( - output_val_[output_val_.size()-num_recurrent_states_+i], - input_val_[input_val_.size()-num_recurrent_states_+i]); - } - - // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why? - // The instructions are always the same. Maybe a memory allocation - // issue. The ONNX api suggests that tensor are allocated in an - // arena. Does that matter? Maybe the issue is something else? - - last_inference_time_ = t.elapsed_ms(); - - // Perform coordinate transformation. - // From patch-local normalized in [-1,1] to - // frame unnormalized pixel coordinatesettings. - - const cv::Point2f center = patch_center + - (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]}; - - const float size = patch_size*0.5f*output_coord_[2]; - - // Following Eigen which uses quat components in the order w, x, y, z. 
- quat rotation = { - output_quat_[3], - output_quat_[0], - output_quat_[1], - output_quat_[2] }; - - if (model_version_ < 2) - { - // Due to a change in coordinate conventions - rotation = world_to_image(rotation); - } - - const cv::Rect2f outbox = { - patch_center.x + (0.5f*patch_size)*output_box_[0], - patch_center.y + (0.5f*patch_size)*output_box_[1], - 0.5f*patch_size*(output_box_[2]-output_box_[0]), - 0.5f*patch_size*(output_box_[3]-output_box_[1]) - }; - - return std::optional<Face>({ - rotation, outbox, center, size - }); -} - - -cv::Mat PoseEstimator::last_network_input() const -{ - assert(!input_mat_.empty()); - cv::Mat ret; - input_mat_.convertTo(ret, CV_8U, 255., 127.); - cv::cvtColor(ret, ret, cv::COLOR_GRAY2RGB); - return ret; -} - -double PoseEstimator::last_inference_time_millis() const -{ - return last_inference_time_; -} + GuardedThreadCountSwitch(const GuardedThreadCountSwitch&) = delete; + GuardedThreadCountSwitch& operator=(const GuardedThreadCountSwitch&) = delete; +}; bool NeuralNetTracker::detect() @@ -645,6 +319,8 @@ bool NeuralNetTracker::detect() if (!last_roi_) { + // Last iteration the tracker failed to generate a trustworthy + // roi and the localizer also cannot find a face. draw_gizmos({}, {}); return false; } @@ -655,37 +331,28 @@ bool NeuralNetTracker::detect() if (!face) { last_roi_.reset(); - draw_gizmos(*face, {}); + draw_gizmos({}, {}); return false; } - { - // Here: compute ROI from head size estimate. This helps make the size prediction more - // invariant to mouth opening. The tracking can be lost more often at extreme - // rotations, depending on the implementation details. The code down here has - // been tweaked so that it works pretty well. - // In old behaviour ROI is taken from the model outputs - const vec3 offset = rotate_vec(face->rotation, vec3{0.f, 0.1f*face->size, face->size*0.3f}); - const float halfsize = face->size/float(settings_.roi_zoom); - face->box = cv::Rect2f( - face->center.x + offset[0] - halfsize, - face->center.y + offset[1] - halfsize, - halfsize*2.f, - halfsize*2.f - ); - } - - last_roi_ = ewa_filter(*last_roi_, face->box, float(settings_.roi_filter_alpha)); - - Affine pose = compute_pose(*face); + cv::Rect2f roi = expand(face->box, (float)settings_.roi_zoom); - draw_gizmos(*face, pose); + last_roi_ = ewa_filter(*last_roi_, roi, float(settings_.roi_filter_alpha)); + QuatPose pose = compute_filtered_pose(*face); + last_pose_ = pose; + + Affine pose_affine = { + pose.rot.toRotMat3x3(cv::QUAT_ASSUME_UNIT), + pose.pos }; + { QMutexLocker lck(&mtx_); - this->pose_ = pose; + last_pose_affine_ = pose_affine; } + draw_gizmos(*face, last_pose_affine_); + return true; } @@ -697,168 +364,74 @@ void NeuralNetTracker::draw_gizmos( if (!is_visible_) return; - preview_.draw_gizmos(face, pose, last_roi_, last_localizer_roi_, world_to_image(pose.t, grayscale_.size(), intrinsics_)); + preview_.draw_gizmos( + face, + last_roi_, + last_localizer_roi_, + world_to_image(pose.t, grayscale_.size(), intrinsics_)); if (settings_.show_network_input) { cv::Mat netinput = poseestimator_->last_network_input(); preview_.overlay_netinput(netinput); } - - //preview_.draw_fps(fps, last_inference_time); } -Affine NeuralNetTracker::compute_pose(const PoseEstimator::Face &face) const +QuatPose NeuralNetTracker::transform_to_world_pose(const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) const { - // Compute the location the network outputs in 3d space. 
- - const mat33 rot_correction = compute_rotation_correction( - normalize(face.center, grayscale_.rows, grayscale_.cols), - intrinsics_.focal_length_w); - - const mat33 m = rot_correction * quaternion_to_mat33( - image_to_world(face.rotation)); - - /* - - hhhhhh <- head size (meters) - \ | ----------------------- - \ | \ - \ | | - \ | |- tz (meters) - ____ <- face.size / width | - \ | | | - \| |- focal length / - ------------------------ - */ - const vec3 face_world_pos = image_to_world( - face.center.x, face.center.y, face.size, HEAD_SIZE_MM, + face_xy.x, face_xy.y, face_size, HEAD_SIZE_MM, grayscale_.size(), intrinsics_); + const cv::Quatf rot_correction = compute_rotation_correction( + face_world_pos); + + cv::Quatf rot = rot_correction * image_to_world(face_rotation); + // But this is in general not the location of the rotation joint in the neck. - // So we need an extra offset. Which we determine by solving + // So we need an extra offset. Which we determine by computing // z,y,z-pos = head_joint_loc + R_face * offset - const vec3 pos = face_world_pos - + m * vec3{ - static_cast<float>(settings_.offset_fwd), - static_cast<float>(settings_.offset_up), - static_cast<float>(settings_.offset_right)}; + const vec3 local_offset = vec3{ + static_cast<float>(settings_.offset_fwd), + static_cast<float>(settings_.offset_up), + static_cast<float>(settings_.offset_right)}; + const vec3 offset = rotate(rot, local_offset); + const vec3 pos = face_world_pos + offset; - return { m, pos }; + return { rot, pos }; } -void Preview::init(const cv_video_widget& widget) +QuatPose NeuralNetTracker::compute_filtered_pose(const PoseEstimator::Face &face) { - auto [w,h] = widget.preview_size(); - preview_size_ = { w, h }; -} - - -void Preview::copy_video_frame(const cv::Mat& frame) -{ - cv::Rect roi = make_crop_rect_for_aspect(frame.size(), preview_size_.width, preview_size_.height); - - cv::resize(frame(roi), preview_image_, preview_size_, 0, 0, cv::INTER_NEAREST); - - offset_ = { (float)-roi.x, (float)-roi.y }; - scale_ = float(preview_image_.cols) / float(roi.width); -} - - -void Preview::draw_gizmos( - const std::optional<PoseEstimator::Face> &face, - const Affine& pose, - const std::optional<cv::Rect2f>& last_roi, - const std::optional<cv::Rect2f>& last_localizer_roi, - const cv::Point2f& neckjoint_position) -{ - if (preview_image_.empty()) - return; - - if (last_roi) + if (fps_ > 0.01 && last_pose_ && poseestimator_->has_uncertainty()) { - const int col = 255; - cv::rectangle(preview_image_, transform(*last_roi), cv::Scalar(0, col, 0), /*thickness=*/1); - } - if (last_localizer_roi) - { - const int col = 255; - cv::rectangle(preview_image_, transform(*last_localizer_roi), cv::Scalar(col, 0, 255-col), /*thickness=*/1); + auto image2world = [this](const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) { + return this->transform_to_world_pose(face_rotation, face_xy, face_size); }; + + return apply_filter( + face, + *last_pose_, + 1./fps_, + std::move(image2world), + FiltParams{ + float(settings_.deadzone_hardness), + float(settings_.deadzone_size) + }); } - - if (face) + else { - if (face->size>=1.f) - cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), int(transform(face->size)), cv::Scalar(255,255,255), 2); - cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), 3, cv::Scalar(255,255,255), -1); - - auto draw_coord_line = [&](int i, const cv::Scalar& color) - { - const float vx = -pose.R(2,i); - const float vy = -pose.R(1,i); - static 
constexpr float len = 100.f; - cv::Point q = face->center + len*cv::Point2f{vx, vy}; - cv::line(preview_image_, static_cast<cv::Point>(transform(face->center)), static_cast<cv::Point>(transform(q)), color, 2); - }; - draw_coord_line(0, {0, 0, 255}); - draw_coord_line(1, {0, 255, 0}); - draw_coord_line(2, {255, 0, 0}); - - // Draw the computed joint position - auto xy = transform(neckjoint_position); - cv::circle(preview_image_, cv::Point(xy.x,xy.y), 5, cv::Scalar(0,0,255), -1); + return transform_to_world_pose(face.rotation, face.center, face.size); } } -void Preview::overlay_netinput(const cv::Mat& netinput) -{ - if (netinput.empty()) - return; - - const int w = std::min(netinput.cols, preview_image_.cols); - const int h = std::min(netinput.rows, preview_image_.rows); - cv::Rect roi(0, 0, w, h); - netinput(roi).copyTo(preview_image_(roi)); -} - -void Preview::draw_fps(double fps, double last_inference_time) -{ - char buf[128]; - ::snprintf(buf, sizeof(buf), "%d Hz, pose inference: %d ms", std::clamp(int(fps), 0, 9999), int(last_inference_time)); - cv::putText(preview_image_, buf, cv::Point(10, preview_image_.rows-10), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(0, 255, 0), 1); -} - - -void Preview::copy_to_widget(cv_video_widget& widget) -{ - if (preview_image_.rows > 0) - widget.update_image(preview_image_); -} - - -cv::Rect2f Preview::transform(const cv::Rect2f& r) const -{ - return { (r.x - offset_.x)*scale_, (r.y - offset_.y)*scale_, r.width*scale_, r.height*scale_ }; -} - -cv::Point2f Preview::transform(const cv::Point2f& p) const -{ - return { (p.x - offset_.x)*scale_ , (p.y - offset_.y)*scale_ }; -} - -float Preview::transform(float s) const -{ - return s * scale_; -} - NeuralNetTracker::NeuralNetTracker() { opencv_init(); + neuralnet_tracker_tests::run(); } @@ -921,6 +494,7 @@ bool NeuralNetTracker::load_and_initialize_model() << e.what(); return false; } + return true; } @@ -960,46 +534,6 @@ bool NeuralNetTracker::open_camera() } -void NeuralNetTracker::set_intrinsics() -{ - const int w = grayscale_.cols, h = grayscale_.rows; - const double diag_fov = settings_.fov * M_PI / 180.; - const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w)); - const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h)); - const double focal_length_w = 1. / tan(.5 * fov_w); - const double focal_length_h = 1. 
/ tan(.5 * fov_h); - - intrinsics_.fov_h = fov_h; - intrinsics_.fov_w = fov_w; - intrinsics_.focal_length_w = focal_length_w; - intrinsics_.focal_length_h = focal_length_h; -} - - -class GuardedThreadCountSwitch -{ - int old_num_threads_cv_ = 1; - int old_num_threads_omp_ = 1; - public: - GuardedThreadCountSwitch(int num_threads) - { - old_num_threads_cv_ = cv::getNumThreads(); - old_num_threads_omp_ = omp_get_num_threads(); - omp_set_num_threads(num_threads); - cv::setNumThreads(num_threads); - } - - ~GuardedThreadCountSwitch() - { - omp_set_num_threads(old_num_threads_omp_); - cv::setNumThreads(old_num_threads_cv_); - } - - GuardedThreadCountSwitch(const GuardedThreadCountSwitch&) = delete; - GuardedThreadCountSwitch& operator=(const GuardedThreadCountSwitch&) = delete; -}; - - void NeuralNetTracker::run() { preview_.init(*video_widget_); @@ -1055,7 +589,7 @@ void NeuralNetTracker::run() } } - set_intrinsics(); + intrinsics_ = make_intrinsics(grayscale_, settings_); detect(); @@ -1113,7 +647,7 @@ void NeuralNetTracker::data(double *data) Affine tmp = [&]() { QMutexLocker lck(&mtx_); - return pose_; + return last_pose_affine_; }(); const auto& mx = tmp.R.col(0); @@ -1140,7 +674,7 @@ void NeuralNetTracker::data(double *data) Affine NeuralNetTracker::pose() { QMutexLocker lck(&mtx_); - return pose_; + return last_pose_affine_; } std::tuple<cv::Size,double, double> NeuralNetTracker::stats() const diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.h b/tracker-neuralnet/ftnoir_tracker_neuralnet.h index 9b481186..9e0374da 100644 --- a/tracker-neuralnet/ftnoir_tracker_neuralnet.h +++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.h @@ -7,6 +7,11 @@ #pragma once +#include "ui_neuralnet-trackercontrols.h" +#include "model_adapters.h" +#include "deadzone_filter.h" +#include "preview.h" + #include "options/options.hpp" #include "api/plugin-api.hpp" #include "cv/video-widget.hpp" @@ -27,13 +32,9 @@ #include <cinttypes> #include <array> -#include <onnxruntime_cxx_api.h> - #include <opencv2/core.hpp> -#include <opencv2/core/types.hpp> #include <opencv2/imgproc.hpp> -#include "ui_neuralnet-trackercontrols.h" namespace neuralnet_tracker_ns { @@ -81,6 +82,8 @@ struct Settings : opts { value<bool> use_mjpeg { b, "use-mjpeg", false }; value<int> num_threads { b, "num-threads", 1 }; value<int> resolution { b, "force-resolution", 0 }; + value<double> deadzone_size { b, "deadzone-size", 1. }; + value<double> deadzone_hardness { b, "deadzone-hardness", 1.5 }; Settings(); }; @@ -94,101 +97,6 @@ struct CamIntrinsics }; -class Localizer -{ - public: - Localizer(Ort::MemoryInfo &allocator_info, - Ort::Session &&session); - - // Returns bounding wrt image coordinate of the input image - // The preceeding float is the score for being a face normalized to [0,1]. - std::pair<float, cv::Rect2f> run( - const cv::Mat &frame); - - double last_inference_time_millis() const; - private: - inline static constexpr int INPUT_IMG_WIDTH = 288; - inline static constexpr int INPUT_IMG_HEIGHT = 224; - Ort::Session session_{nullptr}; - // Inputs / outputs - cv::Mat scaled_frame_{}, input_mat_{}; - Ort::Value input_val_{nullptr}, output_val_{nullptr}; - std::array<float, 5> results_; - double last_inference_time_ = 0; -}; - - -class PoseEstimator -{ - public: - struct Face - { - std::array<float,4> rotation; // Quaternion, (w, x, y, z) - cv::Rect2f box; - cv::Point2f center; - float size; - }; - - PoseEstimator(Ort::MemoryInfo &allocator_info, - Ort::Session &&session); - /** Inference - * - * Coordinates are defined wrt. 
the image space of the input `frame`. - * X goes right, Z (depth) into the image, Y points down (like pixel coordinates values increase from top to bottom) - */ - std::optional<Face> run(const cv::Mat &frame, const cv::Rect &box); - // Returns an image compatible with the 'frame' image for displaying. - cv::Mat last_network_input() const; - double last_inference_time_millis() const; - private: - // Operates on the private image data members - int find_input_intensity_90_pct_quantile() const; - - int64_t model_version_ = 0; // Queried meta data from the ONNX file - Ort::Session session_{nullptr}; // ONNX's runtime context for running the model - Ort::Allocator allocator_; // Memory allocator for tensors - // Inputs - cv::Mat scaled_frame_{}, input_mat_{}; // Input. One is the original crop, the other is rescaled (?) - std::vector<Ort::Value> input_val_; // Tensors to put into the model - std::vector<const char*> input_names_; // Refers to the names in the onnx model. - // Outputs - cv::Vec<float, 3> output_coord_{}; // 2d Coordinate and head size output. - cv::Vec<float, 4> output_quat_{}; // Quaternion output - cv::Vec<float, 4> output_box_{}; // Bounding box output - std::vector<Ort::Value> output_val_; // Tensors to put the model outputs in. - std::vector<const char*> output_names_; // Refers to the names in the onnx model. - size_t num_recurrent_states_ = 0; - double last_inference_time_ = 0; -}; - - -class Preview -{ -public: - void init(const cv_video_widget& widget); - void copy_video_frame(const cv::Mat& frame); - void draw_gizmos( - const std::optional<PoseEstimator::Face> &face, - const Affine& pose, - const std::optional<cv::Rect2f>& last_roi, - const std::optional<cv::Rect2f>& last_localizer_roi, - const cv::Point2f& neckjoint_position); - void overlay_netinput(const cv::Mat& netinput); - void draw_fps(double fps, double last_inference_time); - void copy_to_widget(cv_video_widget& widget); -private: - // Transform from camera frame to preview - cv::Rect2f transform(const cv::Rect2f& r) const; - cv::Point2f transform(const cv::Point2f& p) const; - float transform(float s) const; - - cv::Mat preview_image_; - cv::Size preview_size_ = { 0, 0 }; - float scale_ = 1.f; - cv::Point2f offset_ = { 0.f, 0.f}; -}; - - class NeuralNetTracker : protected virtual QThread, public ITracker { Q_OBJECT @@ -214,7 +122,10 @@ private: const std::optional<PoseEstimator::Face> &face, const Affine& pose); void update_fps(double dt); - Affine compute_pose(const PoseEstimator::Face &face) const; + // Secretly applies filtering while computing the pose in 3d space. + QuatPose compute_filtered_pose(const PoseEstimator::Face &face); + // Compute the pose in 3d space taking the network outputs + QuatPose transform_to_world_pose(const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) const; Settings settings_; std::optional<Localizer> localizer_; @@ -227,7 +138,7 @@ private: std::array<cv::Mat,2> downsized_original_images_ = {}; // Image pyramid std::optional<cv::Rect2f> last_localizer_roi_; std::optional<cv::Rect2f> last_roi_; - static constexpr float HEAD_SIZE_MM = 200.f; + static constexpr float HEAD_SIZE_MM = 200.f; // In the vertical. Approximately. 
mutable QMutex stats_mtx_; double fps_ = 0; @@ -238,8 +149,9 @@ private: int num_threads_ = 1; bool is_visible_ = true; - QMutex mtx_; // Protects the pose - Affine pose_; + QMutex mtx_ = {}; // Protects the pose + std::optional<QuatPose> last_pose_ = {}; + Affine last_pose_affine_ = {}; Preview preview_; std::unique_ptr<cv_video_widget> video_widget_; @@ -288,6 +200,15 @@ class NeuralNetMetadata : public Metadata } // neuralnet_tracker_ns + +namespace neuralnet_tracker_tests +{ + +void run(); + +} + + using neuralnet_tracker_ns::NeuralNetTracker; using neuralnet_tracker_ns::NeuralNetDialog; using neuralnet_tracker_ns::NeuralNetMetadata; diff --git a/tracker-neuralnet/model_adapters.cpp b/tracker-neuralnet/model_adapters.cpp new file mode 100644 index 00000000..af599321 --- /dev/null +++ b/tracker-neuralnet/model_adapters.cpp @@ -0,0 +1,433 @@ +#include "model_adapters.h" + +#include "compat/timer.hpp" + +#include <opencv2/core.hpp> +#include <opencv2/core/quaternion.hpp> +#include <opencv2/imgproc.hpp> + +#include <QDebug> + + +namespace neuralnet_tracker_ns +{ + + +float sigmoid(float x) +{ + return 1.f/(1.f + std::exp(-x)); +} + + +// Defined in ftnoir_tracker_neuralnet.cpp +// Normally we wouldn't need it here. However ... see below. +cv::Quatf image_to_world(cv::Quatf q); + + +cv::Quatf world_to_image(cv::Quatf q) +{ + // It's its own inverse. + return image_to_world(q); +} + + +cv::Rect2f unnormalize(const cv::Rect2f &r, int h, int w) +{ + auto unnorm = [](float x) -> float { return 0.5*(x+1); }; + auto tl = r.tl(); + auto br = r.br(); + auto x0 = unnorm(tl.x)*w; + auto y0 = unnorm(tl.y)*h; + auto x1 = unnorm(br.x)*w; + auto y1 = unnorm(br.y)*h; + return { + x0, y0, x1-x0, y1-y0 + }; +} + + +// Returns width and height of the input tensor, or throws. +// Expects the model to take one tensor as input that must +// have the shape B x C x H x W, where B=C=1. +cv::Size get_input_image_shape(const Ort::Session &session) +{ + if (session.GetInputCount() < 1) + throw std::invalid_argument("Model must take at least one input tensor"); + const std::vector<std::int64_t> shape = + session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); + if (shape.size() != 4) + throw std::invalid_argument("Model takes the input tensor in the wrong shape"); + return { static_cast<int>(shape[3]), static_cast<int>(shape[2]) }; +} + + +Ort::Value create_tensor(const Ort::TypeInfo& info, Ort::Allocator& alloc) +{ + const auto shape = info.GetTensorTypeAndShapeInfo().GetShape(); + auto t = Ort::Value::CreateTensor<float>( + alloc, shape.data(), shape.size()); + memset(t.GetTensorMutableData<float>(), 0, sizeof(float)*info.GetTensorTypeAndShapeInfo().GetElementCount()); + return t; +} + + +int find_input_intensity_quantile(const cv::Mat& frame, float percentage) +{ + const int channels[] = { 0 }; + const int hist_size[] = { 256 }; + float range[] = { 0, 256 }; + const float* ranges[] = { range }; + cv::Mat hist; + cv::calcHist(&frame, 1, channels, cv::Mat(), hist, 1, hist_size, ranges, true, false); + int gray_level = 0; + const int num_pixels_quantile = frame.total()*percentage*0.01f; + int num_pixels_accum = 0; + for (int i=0; i<hist_size[0]; ++i) + { + num_pixels_accum += hist.at<float>(i); + if (num_pixels_accum > num_pixels_quantile) + { + gray_level = i; + break; + } + } + return gray_level; +} + + +// Automatic brightness adjustment. Scales brightness to lie between -.5 and 0.5, roughly. 
+void normalize_brightness(const cv::Mat& frame, cv::Mat& out) +{ + const float pct = 90; + + const int brightness = find_input_intensity_quantile(frame, pct); + + const double alpha = brightness<127 ? (pct/100.f*0.5f/std::max(5,brightness)) : 1./255; + const double beta = -0.5; + + frame.convertTo(out, CV_32F, alpha, beta); +} + + + +Localizer::Localizer(Ort::MemoryInfo &allocator_info, Ort::Session &&session) : + session_{std::move(session)}, + scaled_frame_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_8U), + input_mat_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_32F) +{ + // Only works when input_mat does not reallocated memory ...which it should not. + // Non-owning memory reference to input_mat? + // Note: shape = (bach x channels x h x w) + const std::int64_t input_shape[4] = { 1, 1, INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH }; + input_val_ = Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4); + + const std::int64_t output_shape[2] = { 1, 5 }; + output_val_ = Ort::Value::CreateTensor<float>(allocator_info, results_.data(), results_.size(), output_shape, 2); +} + + +std::pair<float, cv::Rect2f> Localizer::run( + const cv::Mat &frame) +{ + auto p = input_mat_.ptr(0); + + cv::resize(frame, scaled_frame_, { INPUT_IMG_WIDTH, INPUT_IMG_HEIGHT }, 0, 0, cv::INTER_AREA); + scaled_frame_.convertTo(input_mat_, CV_32F, 1./255., -0.5); + + assert (input_mat_.ptr(0) == p); + assert (!input_mat_.empty() && input_mat_.isContinuous()); + assert (input_mat_.cols == INPUT_IMG_WIDTH && input_mat_.rows == INPUT_IMG_HEIGHT); + + const char* input_names[] = {"x"}; + const char* output_names[] = {"logit_box"}; + + Timer t; t.start(); + + session_.Run(Ort::RunOptions{nullptr}, input_names, &input_val_, 1, output_names, &output_val_, 1); + + last_inference_time_ = t.elapsed_ms(); + + const cv::Rect2f roi = unnormalize(cv::Rect2f{ + results_[1], + results_[2], + results_[3]-results_[1], // Width + results_[4]-results_[2] // Height + }, frame.rows, frame.cols); + const float score = sigmoid(results_[0]); + + return { score, roi }; +} + + +double Localizer::last_inference_time_millis() const +{ + return last_inference_time_; +} + + +PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&session) + : model_version_{session.GetModelMetadata().GetVersion()} + , session_{std::move(session)} + , allocator_{session_, allocator_info} +{ + using namespace std::literals::string_literals; + + if (session_.GetOutputCount() < 2) + throw std::runtime_error("Invalid Model: must have at least two outputs"); + + // WARNING UB .. but still ... + // If the model was saved without meta data, it seems the version field is uninitialized. + // In that case reading from it is UB. However, in practice we will just get some arbitrary number + // which is hopefully different from the numbers used by models where the version is set. 
+ if (model_version_ != 2 && model_version_ != 3) + model_version_ = 1; + + const cv::Size input_image_shape = get_input_image_shape(session_); + + scaled_frame_ = cv::Mat(input_image_shape, CV_8U, cv::Scalar(0)); + input_mat_ = cv::Mat(input_image_shape, CV_32F, cv::Scalar(0.f)); + + { + const std::int64_t input_shape[4] = { 1, 1, input_image_shape.height, input_image_shape.width }; + input_val_.push_back( + Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4)); + } + + struct TensorSpec + { + std::vector<int64_t> shape; + float* buffer = nullptr; + size_t element_count = 0; + bool available = false; + }; + + std::unordered_map<std::string, TensorSpec> understood_outputs = { + { "pos_size", TensorSpec{ { 1, 3 }, &output_coord_[0], output_coord_.rows } }, + { "quat", TensorSpec{ { 1, 4}, &output_quat_[0], output_quat_.rows } }, + { "box", TensorSpec{ { 1, 4}, &output_box_[0], output_box_.rows } }, + { "rotaxis_scales_tril", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, + { "rotaxis_std", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, // TODO: Delete when old models aren't used any more + { "eyes", TensorSpec{ { 1, 2}, output_eyes_.val, output_eyes_.rows }}, + { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}}, + { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}}, + //{ "box_std", TensorSpec{ {1, 4}, output_box_scales_.val, output_box_scales_.rows}} + }; + + qDebug() << "Pose model inputs (" << session_.GetInputCount() << ")"; + qDebug() << "Pose model outputs (" << session_.GetOutputCount() << "):"; + for (size_t i=0; i<session_.GetOutputCount(); ++i) + { + const char* name = session_.GetOutputName(i, allocator_); + const auto& output_info = session_.GetOutputTypeInfo(i); + const auto& onnx_tensor_spec = output_info.GetTensorTypeAndShapeInfo(); + auto my_tensor_spec = understood_outputs.find(name); + + qDebug() << "\t" << name << " (" << onnx_tensor_spec.GetShape() << ") dtype: " << onnx_tensor_spec.GetElementType() << " " << + (my_tensor_spec != understood_outputs.end() ? 
"ok" : "unknown"); + + if (my_tensor_spec != understood_outputs.end()) + { + TensorSpec& t = my_tensor_spec->second; + if (onnx_tensor_spec.GetShape() != t.shape || + onnx_tensor_spec.GetElementType() != Ort::TypeToTensorType<float>::type) + throw std::runtime_error("Invalid output tensor spec for "s + name); + output_val_.push_back(Ort::Value::CreateTensor<float>( + allocator_info, t.buffer, t.element_count, t.shape.data(), t.shape.size())); + t.available = true; + } + else + { + // Create tensor regardless and ignore output + output_val_.push_back(create_tensor(output_info, allocator_)); + } + output_names_.push_back(name); + } + + has_uncertainty_ = understood_outputs.at("rotaxis_scales_tril").available || + understood_outputs.at("rotaxis_std").available; + has_uncertainty_ &= understood_outputs.at("pos_size_std").available || + understood_outputs.at("pos_size_scales").available; + //has_uncertainty_ &= understood_outputs.at("box_std").available; + has_eye_closed_detection_ = understood_outputs.at("eyes").available; + + // FIXME: Recurrent states + + // size_t num_regular_outputs = 2; + + // num_recurrent_states_ = session_.GetInputCount()-1; + // if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_) + // throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have an equal number of inputs and outputs for tensors holding hidden states of recurrent layers."); + + // // Create tensors for recurrent state + // for (size_t i = 0; i < num_recurrent_states_; ++i) + // { + // const auto& input_info = session_.GetInputTypeInfo(1+i); + // const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i); + // if (input_info.GetTensorTypeAndShapeInfo().GetShape() != + // output_info.GetTensorTypeAndShapeInfo().GetShape()) + // throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have the same shape on input and output"); + // input_val_.push_back(create_tensor(input_info, allocator_)); + // output_val_.push_back(create_tensor(output_info, allocator_)); + // } + + for (size_t i = 0; i < session_.GetInputCount(); ++i) + { + input_names_.push_back(session_.GetInputName(i, allocator_)); + } + + assert (input_names_.size() == input_val_.size()); + assert (output_names_.size() == output_val_.size()); +} + + +std::optional<PoseEstimator::Face> PoseEstimator::run( + const cv::Mat &frame, const cv::Rect &box) +{ + cv::Mat cropped; + + const int patch_size = std::max(box.width, box.height)*1.05; + const cv::Point2f patch_center = { + std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols), + std::clamp<float>(box.y + 0.5f*box.height, 0.f, frame.rows) + }; + cv::getRectSubPix(frame, {patch_size, patch_size}, patch_center, cropped); + + // This can fail if patch_center is outside the image boundaries. + // We have to catch this case. 
+ if (cropped.rows != patch_size || cropped.cols != patch_size) + return {}; + + [[maybe_unused]] auto* p = input_mat_.ptr(0); + + cv::resize(cropped, scaled_frame_, scaled_frame_.size(), 0, 0, cv::INTER_AREA); + + normalize_brightness(scaled_frame_, input_mat_); + + assert (input_mat_.ptr(0) == p); + assert (!input_mat_.empty() && input_mat_.isContinuous()); + + Timer t; t.start(); + + try + { + session_.Run( + Ort::RunOptions{ nullptr }, + input_names_.data(), + input_val_.data(), + input_val_.size(), + output_names_.data(), + output_val_.data(), + output_val_.size()); + } + catch (const Ort::Exception &e) + { + qDebug() << "Failed to run the model: " << e.what(); + return {}; + } + + for (size_t i = 0; i<num_recurrent_states_; ++i) + { + // Next step, the current output becomes the input. + // Thus we realize the recurrent connection. + // Only swaps the internal pointers. There is no copy of data. + std::swap( + output_val_[output_val_.size()-num_recurrent_states_+i], + input_val_[input_val_.size()-num_recurrent_states_+i]); + } + + // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why? + // The instructions are always the same. Maybe a memory allocation + // issue. The ONNX API suggests that tensors are allocated in an + // arena. Does that matter? Maybe the issue is something else? + + last_inference_time_ = t.elapsed_ms(); + + // Perform coordinate transformation. + // From patch-local normalized in [-1,1] to + // frame unnormalized pixel coordinates. + + const cv::Point2f center = patch_center + + (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]}; + + cv::Point2f center_stddev = { + (0.5f*patch_size)*output_coord_scales_[0], + (0.5f*patch_size)*output_coord_scales_[1] }; + + const float size = patch_size*0.5f*output_coord_[2]; + + float size_stddev = patch_size*0.5f*output_coord_scales_[2]; + + // Following Eigen, which uses quaternion components in the order w, x, y, z, + // as does OpenCV. + cv::Quatf rotation = { + output_quat_[3], + output_quat_[0], + output_quat_[1], + output_quat_[2] }; + + // Should be lower triangular. If not, maybe something is wrong with the memory layout ... or the model. + assert(output_rotaxis_scales_tril_(0, 1) == 0); + assert(output_rotaxis_scales_tril_(0, 2) == 0); + assert(output_rotaxis_scales_tril_(1, 2) == 0); + + cv::Matx33f rotaxis_scales_tril = output_rotaxis_scales_tril_; + + if (model_version_ < 2) + { + // Due to a change in coordinate conventions + rotation = world_to_image(rotation); + } + + const cv::Rect2f outbox = { + patch_center.x + (0.5f*patch_size)*output_box_[0], + patch_center.y + (0.5f*patch_size)*output_box_[1], + 0.5f*patch_size*(output_box_[2]-output_box_[0]), + 0.5f*patch_size*(output_box_[3]-output_box_[1]) + }; + // const RoiCorners outbox = { + // patch_center + 0.5f*patch_size*cv::Point2f{output_box_[0], output_box_[1]}, + // patch_center + 0.5f*patch_size*cv::Point2f{output_box_[2], output_box_[3]} + // }; + // RoiCorners outbox_stddev = { + // 0.5f*patch_size*cv::Point2f{output_box_scales_[0], output_box_scales_[1]}, + // 0.5f*patch_size*cv::Point2f{output_box_scales_[2], output_box_scales_[3]} + // }; + + // Because the model is sensitive to closing eyes, we increase the uncertainty + // a lot to make the subsequent filtering smooth the output more. This should suppress + // "twitching" when the user blinks. + if (has_eye_closed_detection_) + { + const float eye_open = std::min(output_eyes_[0], output_eyes_[1]); + const float increase_factor = 1.f + 10.f * std::pow(1. 
- eye_open,4.f); + rotaxis_scales_tril *= increase_factor; + size_stddev *= increase_factor; + center_stddev *= increase_factor; + } + + return std::optional<Face>({ + rotation, rotaxis_scales_tril, outbox, center, center_stddev, size, size_stddev + }); +} + + +cv::Mat PoseEstimator::last_network_input() const +{ + assert(!input_mat_.empty()); + cv::Mat ret; + input_mat_.convertTo(ret, CV_8U, 255., 127.); + cv::cvtColor(ret, ret, cv::COLOR_GRAY2RGB); + return ret; +} + + +double PoseEstimator::last_inference_time_millis() const +{ + return last_inference_time_; +} + + + + + +} // namespace neuralnet_tracker_ns
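For orientation, the blink handling at the end of PoseEstimator::run() above reduces to a single inflation factor, 1 + 10*(1 - eye_open)^4, applied to the rotation, size and center standard deviations. A throwaway sketch (not part of the patch) of what that factor does for a few openness values:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // eye_open is the smaller of the two per-eye outputs; 1 = fully open, 0 = closed.
        for (float eye_open : {1.f, 0.75f, 0.5f, 0.f})
        {
            const float increase_factor = 1.f + 10.f * std::pow(1.f - eye_open, 4.f);
            std::printf("eye_open=%.2f -> stddev scale %.2f\n", eye_open, increase_factor);
        }
        return 0;
    }

With fully open eyes the uncertainty is left untouched; with fully closed eyes it is scaled by 11, which widens the deadzone of the subsequent filter accordingly.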
\ No newline at end of file diff --git a/tracker-neuralnet/model_adapters.h b/tracker-neuralnet/model_adapters.h new file mode 100644 index 00000000..3fbfb861 --- /dev/null +++ b/tracker-neuralnet/model_adapters.h @@ -0,0 +1,102 @@ +#pragma once + +#include <optional> +#include <array> +#include <vector> + +#include <onnxruntime_cxx_api.h> +#include <opencv2/core.hpp> +#include "opencv_contrib.h" + + +namespace neuralnet_tracker_ns +{ + +// Generally useful sigmoid function +float sigmoid(float x); + + +class Localizer +{ + public: + Localizer(Ort::MemoryInfo &allocator_info, + Ort::Session &&session); + + // Returns the bounding box w.r.t. the image coordinates of the input image. + // The preceding float is the score for being a face, normalized to [0,1]. + std::pair<float, cv::Rect2f> run( + const cv::Mat &frame); + + double last_inference_time_millis() const; + private: + inline static constexpr int INPUT_IMG_WIDTH = 288; + inline static constexpr int INPUT_IMG_HEIGHT = 224; + Ort::Session session_{nullptr}; + // Inputs / outputs + cv::Mat scaled_frame_{}, input_mat_{}; + Ort::Value input_val_{nullptr}, output_val_{nullptr}; + std::array<float, 5> results_; + double last_inference_time_ = 0; +}; + + +class PoseEstimator +{ + public: + struct Face + { + cv::Quatf rotation; + cv::Matx33f rotaxis_cov_tril; // Lower triangular factor of Cholesky decomposition + cv::Rect2f box; + cv::Point2f center; + cv::Point2f center_stddev; + float size; + float size_stddev; + }; + + PoseEstimator(Ort::MemoryInfo &allocator_info, + Ort::Session &&session); + /** Inference + * + * Coordinates are defined wrt. the image space of the input `frame`. + * X goes right, Z (depth) into the image, Y points down (as with pixel coordinates, values increase from top to bottom). + */ + std::optional<Face> run(const cv::Mat &frame, const cv::Rect &box); + // Returns an image compatible with the 'frame' image for displaying. + cv::Mat last_network_input() const; + double last_inference_time_millis() const; + bool has_uncertainty() const { return has_uncertainty_; } + + private: + int64_t model_version_ = 0; // Queried from the ONNX file's metadata + Ort::Session session_{nullptr}; // ONNX Runtime context for running the model + Ort::Allocator allocator_; // Memory allocator for tensors + // Inputs + cv::Mat scaled_frame_{}, input_mat_{}; // Input. One is the original crop, the other is rescaled (?) + std::vector<Ort::Value> input_val_; // Tensors to put into the model + std::vector<const char*> input_names_; // Refers to the names in the onnx model. + // Outputs + cv::Vec<float, 3> output_coord_{}; // 2D coordinate and head size output. + cv::Vec<float, 4> output_quat_{}; // Quaternion output + cv::Vec<float, 4> output_box_{}; // Bounding box output + cv::Matx33f output_rotaxis_scales_tril_{}; // Lower triangular matrix of the LLT factorization of the covariance of the rotation vector, as offset from the output quaternion + cv::Vec<float, 2> output_eyes_{}; + cv::Vec<float, 3> output_coord_scales_{}; + std::vector<Ort::Value> output_val_; // Tensors to put the model outputs in. + std::vector<const char*> output_names_; // Refers to the names in the onnx model. + // More bookkeeping + size_t num_recurrent_states_ = 0; + double last_inference_time_ = 0; + bool has_uncertainty_ = false; + bool has_eye_closed_detection_ = false; +}; + + +// Finds the intensity below which `percentage` percent of the pixels lie. 
+int find_input_intensity_quantile(const cv::Mat& frame, float percentage); + +// Adjusts the brightness levels to the full range and scales the values to [-0.5, 0.5] +void normalize_brightness(const cv::Mat& frame, cv::Mat& out); + + +} // namespace neuralnet_tracker_ns
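A hypothetical end-to-end usage sketch of the two adapters declared above (not taken from the tracker sources): the model file names, the test image and the 0.5 score threshold are placeholders, the real tracker drives this from its settings and camera loop, and on Windows the ONNX model path would have to be a wide string. The frame is assumed to be a single-channel 8-bit image, matching the {1, 1, H, W} model input.

    #include "model_adapters.h"

    #include <onnxruntime_cxx_api.h>
    #include <opencv2/imgcodecs.hpp>
    #include <cstdio>

    int main()
    {
        using namespace neuralnet_tracker_ns;

        Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "nn-tracker-example");
        Ort::SessionOptions opts;
        Ort::MemoryInfo mem_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);

        // Placeholder model paths; the tracker normally loads these from its settings.
        Localizer localizer(mem_info, Ort::Session(env, "localizer.onnx", opts));
        PoseEstimator pose_estimator(mem_info, Ort::Session(env, "pose-model.onnx", opts));

        const cv::Mat frame = cv::imread("frame.png", cv::IMREAD_GRAYSCALE);
        if (frame.empty())
            return 1;

        // First find the face, then estimate the pose inside the returned box.
        const auto [score, roi] = localizer.run(frame);
        if (score < 0.5f)  // Made-up threshold, for illustration only.
            return 1;

        if (const auto face = pose_estimator.run(frame, static_cast<cv::Rect>(roi)))
            std::printf("head size: %f px, uncertainty outputs available: %d\n",
                        face->size, int(pose_estimator.has_uncertainty()));
        return 0;
    }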
\ No newline at end of file diff --git a/tracker-neuralnet/opencv_contrib.h b/tracker-neuralnet/opencv_contrib.h new file mode 100644 index 00000000..af92c12f --- /dev/null +++ b/tracker-neuralnet/opencv_contrib.h @@ -0,0 +1,120 @@ +#pragma once + +#include <opencv2/core.hpp> +#include <opencv2/core/base.hpp> +#include <opencv2/core/quaternion.hpp> + +// Well, eventually it might become a contribution + +namespace cvcontrib +{ + + +template<class T> +cv::Point_<T> as_point(const cv::Size_<T>& s) +{ + return { s.width, s.height }; +} + + +template<class T> +cv::Size_<T> as_size(const cv::Point_<T>& p) +{ + return { p.x, p.y }; +} + + +template<int n, int m> +inline bool allfinite(const cv::Matx<float, n, m> &mat) +{ + const size_t sz = mat.rows*mat.cols; + for (size_t i=0; i<sz; ++i) + if (!std::isfinite(mat.val[i])) + return false; + return true; +} + + +// Because the compiler refuses to convert it automatically +template<int n> +inline cv::Vec<float, n> to_vec(const cv::Matx<float, n, 1>& m) +{ + return cv::Vec<float,n>{m.val}; +} + + +template<int n, int m, int o> +inline void set_minor(cv::Vec<float, m> &dst, const int startrow, const cv::Matx<float, o, 1> &src) +{ + assert (startrow>=0 && startrow+n <= dst.rows); + for (int row=startrow, i=0; row<startrow+n; ++row,++i) + { + dst[row] = src(i,0); + } +} + + +template<int nrows, int ncols, int m, int n> +inline void set_minor(cv::Matx<float, m, n>& dst, const int startrow, int startcol, const cv::Matx<float, nrows, ncols> &src) +{ + assert (startrow>=0 && startrow+nrows <= dst.rows); + assert (startcol>=0 && startcol+ncols <= dst.cols); + for (int row=startrow, i=0; row<startrow+nrows; ++row,++i) + { + for (int col=startcol, j=0; col<startcol+ncols; ++col,++j) + { + dst(row, col) = src(i,j); + } + } +} + + +inline cv::Quatf identity_quat() +{ + return cv::Quatf(1,0,0,0); +} + + +inline cv::Vec3f toRotVec(const cv::Quatf& q) +{ + // This is an improved implementation +#if 1 + // w = cos(alpha/2) + // xyz = sin(alpha/2)*axis + static constexpr float eps = 1.e-12; + const cv::Vec3f xyz{q.x, q.y, q.z}; + const float len = cv::norm(xyz); + const float angle = std::atan2(len, q.w)*2.f; + return xyz*(angle/(len+eps)); +#else + // The OpenCV implementation fails even the simplest test: + // out = toRVec(cv::Quatf{1., 0., 0., 0. }); + // ASSERT_TRUE(std::isfinite(out[0]) && std::isfinite(out[1]) && std::isfinite(out[2])); + return q.toRotVec(); +#endif +} + + +inline cv::Vec3f rotate(const cv::Quatf& q, const cv::Vec3f &v) +{ + const auto r = q * cv::Quatf{0., v[0], v[1], v[2]} * q.conjugate(); + return { r.x, r.y, r.z }; +} + + +template<int n> +inline cv::Matx<float, n, n> cholesky(const cv::Matx<float, n, n>& mat) +{ + cv::Matx<float, n, n> l = mat; + // The code is the documentation! + // https://github.com/opencv/opencv/blob/4.5.4/modules/core/src/matrix_decomp.cpp#L95 + cv::Cholesky(l.val, l.cols * sizeof(float), n, nullptr, 0, 0); + // cv::Cholesky doesn't clear the upper triangle, so we do it ourselves. + for (int row=0; row<n; ++row) + for (int col=row+1; col<n; ++col) + l(row, col) = 0.f; + return l; +} + + +} // namespace cvcontrib
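The cholesky() helper above is easy to sanity-check: the returned factor should be lower triangular (the upper part is explicitly zeroed) and should reproduce the input via L*L^T. A small standalone sketch, not part of the patch, assuming opencv_contrib.h is on the include path:

    #include "opencv_contrib.h"
    #include <opencv2/core.hpp>
    #include <cassert>

    int main()
    {
        using namespace cvcontrib;
        // A small symmetric positive definite matrix.
        const cv::Matx22f a{ 4.f, 2.f,
                             2.f, 3.f };
        const cv::Matx22f l = cholesky(a);
        // Lower triangular: the upper off-diagonal entry was cleared by the helper.
        assert(l(0, 1) == 0.f);
        // Reconstruction: l * l^T reproduces the input up to rounding.
        const cv::Matx22f b = l * l.t();
        assert(cv::norm(b - a, cv::NORM_INF) < 1e-4);
        return 0;
    }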
\ No newline at end of file diff --git a/tracker-neuralnet/preview.cpp b/tracker-neuralnet/preview.cpp new file mode 100644 index 00000000..76a6bbc0 --- /dev/null +++ b/tracker-neuralnet/preview.cpp @@ -0,0 +1,135 @@ +#include "preview.h" + + +namespace neuralnet_tracker_ns +{ + + +cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h) +{ + auto [w, h] = size; + if ( w*aspect_h > aspect_w*h ) + { + // Image is too wide + const int new_w = (aspect_w*h)/aspect_h; + return cv::Rect((w - new_w)/2, 0, new_w, h); + } + else + { + const int new_h = (aspect_h*w)/aspect_w; + return cv::Rect(0, (h - new_h)/2, w, new_h); + } +} + + + + +void Preview::init(const cv_video_widget& widget) +{ + auto [w,h] = widget.preview_size(); + preview_size_ = { w, h }; +} + + +void Preview::copy_video_frame(const cv::Mat& frame) +{ + cv::Rect roi = make_crop_rect_for_aspect(frame.size(), preview_size_.width, preview_size_.height); + + cv::resize(frame(roi), preview_image_, preview_size_, 0, 0, cv::INTER_NEAREST); + + offset_ = { (float)-roi.x, (float)-roi.y }; + scale_ = float(preview_image_.cols) / float(roi.width); +} + + +void Preview::draw_gizmos( + const std::optional<PoseEstimator::Face> &face, + const std::optional<cv::Rect2f>& last_roi, + const std::optional<cv::Rect2f>& last_localizer_roi, + const cv::Point2f& neckjoint_position) +{ + if (preview_image_.empty()) + return; + + if (last_roi) + { + const int col = 255; + cv::rectangle(preview_image_, transform(*last_roi), cv::Scalar(0, col, 0), /*thickness=*/1); + } + if (last_localizer_roi) + { + const int col = 255; + cv::rectangle(preview_image_, transform(*last_localizer_roi), cv::Scalar(col, 0, 255-col), /*thickness=*/1); + } + + if (face) + { + if (face->size>=1.f) + cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), int(transform(face->size)), cv::Scalar(255,255,255), 2); + cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), 3, cv::Scalar(255,255,255), -1); + + const cv::Matx33f R = face->rotation.toRotMat3x3(cv::QUAT_ASSUME_UNIT); + + auto draw_coord_line = [&](int i, const cv::Scalar& color) + { + const float vx = R(0,i); + const float vy = R(1,i); + static constexpr float len = 100.f; + cv::Point q = face->center + len*cv::Point2f{vx, vy}; + cv::line(preview_image_, static_cast<cv::Point>(transform(face->center)), static_cast<cv::Point>(transform(q)), color, 2); + }; + draw_coord_line(0, {0, 0, 255}); + draw_coord_line(1, {0, 255, 0}); + draw_coord_line(2, {255, 0, 0}); + + // Draw the computed joint position + auto xy = transform(neckjoint_position); + cv::circle(preview_image_, cv::Point(xy.x,xy.y), 5, cv::Scalar(0,0,255), -1); + } + + +} + +void Preview::overlay_netinput(const cv::Mat& netinput) +{ + if (netinput.empty()) + return; + + const int w = std::min(netinput.cols, preview_image_.cols); + const int h = std::min(netinput.rows, preview_image_.rows); + cv::Rect roi(0, 0, w, h); + netinput(roi).copyTo(preview_image_(roi)); +} + +void Preview::draw_fps(double fps, double last_inference_time) +{ + char buf[128]; + ::snprintf(buf, sizeof(buf), "%d Hz, pose inference: %d ms", std::clamp(int(fps), 0, 9999), int(last_inference_time)); + cv::putText(preview_image_, buf, cv::Point(10, preview_image_.rows-10), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(0, 255, 0), 1); +} + + +void Preview::copy_to_widget(cv_video_widget& widget) +{ + if (preview_image_.rows > 0) + widget.update_image(preview_image_); +} + + +cv::Rect2f Preview::transform(const cv::Rect2f& r) const +{ + return { 
(r.x - offset_.x)*scale_, (r.y - offset_.y)*scale_, r.width*scale_, r.height*scale_ }; +} + +cv::Point2f Preview::transform(const cv::Point2f& p) const +{ + return { (p.x - offset_.x)*scale_ , (p.y - offset_.y)*scale_ }; +} + +float Preview::transform(float s) const +{ + return s * scale_; +} + + +}
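To make the aspect-ratio arithmetic in make_crop_rect_for_aspect() above concrete, here is a standalone re-statement of the same computation for one hypothetical case, a 640x480 camera frame shown in a 16:9 preview (illustration only, not part of the patch):

    #include <opencv2/core.hpp>
    #include <cstdio>

    int main()
    {
        // Mirrors make_crop_rect_for_aspect(): the 4:3 frame is too tall for 16:9,
        // so rows are cropped symmetrically from the top and bottom.
        const cv::Size size{640, 480};
        const int aspect_w = 16, aspect_h = 9;
        cv::Rect roi;
        if (size.width * aspect_h > aspect_w * size.height)
        {
            const int new_w = (aspect_w * size.height) / aspect_h;
            roi = cv::Rect((size.width - new_w) / 2, 0, new_w, size.height);
        }
        else
        {
            const int new_h = (aspect_h * size.width) / aspect_w;
            roi = cv::Rect(0, (size.height - new_h) / 2, size.width, new_h);
        }
        std::printf("%d %d %d %d\n", roi.x, roi.y, roi.width, roi.height); // prints: 0 60 640 360
        return 0;
    }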
\ No newline at end of file diff --git a/tracker-neuralnet/preview.h b/tracker-neuralnet/preview.h new file mode 100644 index 00000000..adc12993 --- /dev/null +++ b/tracker-neuralnet/preview.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2021 Michael Welter <michael@welter-4d.de> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + */ + +#pragma once + +#include "model_adapters.h" + +#include "cv/video-widget.hpp" + +#include <optional> + +#include <opencv2/core.hpp> +#include <opencv2/imgproc.hpp> + + +namespace neuralnet_tracker_ns +{ + +/** Makes a maximum-size cropping rect with the given aspect. +* @param aspect_w: numerator of the aspect ratio +* @param aspect_h: denominator of the aspect ratio +*/ +cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h); + + +/** This class is responsible for drawing the debug/info gizmos. +* +* In addition there are functions to transform the inputs to the size of +* the preview image, which can be different from the camera frame. +*/ +class Preview +{ +public: + void init(const cv_video_widget& widget); + void copy_video_frame(const cv::Mat& frame); + void draw_gizmos( + const std::optional<PoseEstimator::Face> &face, + const std::optional<cv::Rect2f>& last_roi, + const std::optional<cv::Rect2f>& last_localizer_roi, + const cv::Point2f& neckjoint_position); + void overlay_netinput(const cv::Mat& netinput); + void draw_fps(double fps, double last_inference_time); + void copy_to_widget(cv_video_widget& widget); +private: + // Transform from camera image coordinates to preview coordinates + cv::Rect2f transform(const cv::Rect2f& r) const; + cv::Point2f transform(const cv::Point2f& p) const; + float transform(float s) const; + + cv::Mat preview_image_; + cv::Size preview_size_ = { 0, 0 }; + float scale_ = 1.f; + cv::Point2f offset_ = { 0.f, 0.f}; +}; + +} // neuralnet_tracker_ns
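The interface above implies a per-frame call order roughly like the following sketch. The helper function, its name and its arguments are hypothetical; in the tracker the widget, camera frame and pose state come from the surrounding code, and overlay_netinput() is optional.

    #include "preview.h"

    using namespace neuralnet_tracker_ns;

    void render_preview(Preview& preview, cv_video_widget& widget,
                        const cv::Mat& camera_frame,
                        const std::optional<PoseEstimator::Face>& face,
                        const std::optional<cv::Rect2f>& pose_roi,
                        const std::optional<cv::Rect2f>& localizer_roi,
                        const cv::Point2f& neckjoint,
                        double fps, double inference_ms)
    {
        preview.init(widget);                    // adopt the widget's preview size
        preview.copy_video_frame(camera_frame);  // crop to the preview aspect and resize
        preview.draw_gizmos(face, pose_roi, localizer_roi, neckjoint);
        preview.draw_fps(fps, inference_ms);
        preview.copy_to_widget(widget);          // hand the composed image to the widget
    }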
\ No newline at end of file diff --git a/tracker-neuralnet/tests.cpp b/tracker-neuralnet/tests.cpp new file mode 100644 index 00000000..b1d2a6d0 --- /dev/null +++ b/tracker-neuralnet/tests.cpp @@ -0,0 +1,61 @@ +#include "model_adapters.h" + +#include <algorithm> +#include <numeric> +#include <cstdio> +#include <cstdlib> +#include <iostream> +#include <string> + +namespace neuralnet_tracker_tests +{ + + +void assert_(bool ok, const std::string& msg) +{ + if (ok) + return; + std::cout << msg << std::endl; + std::exit(-1); +} + + +void test_find_input_intensity_quantile() +{ + cv::Mat data(10,10, CV_8UC1); + std::iota(data.begin<uint8_t>(), data.end<uint8_t>(), 0); + + const float pct = 90; + + const int val = neuralnet_tracker_ns::find_input_intensity_quantile(data, pct); + + assert_(val == int(10*10*pct/100.f), "test_find_input_intensity_quantile failed"); +} + + +void test_normalize_brightness() +{ + cv::Mat data(10,10, CV_8UC1); + std::iota(data.begin<uint8_t>(), data.end<uint8_t>(), 0); + + cv::Mat out; + neuralnet_tracker_ns::normalize_brightness(data, out); + + auto [minit,maxit] = std::minmax_element(out.begin<float>(),out.end<float>()); + const auto minval = *minit; + const auto maxval = *maxit; + assert_(std::abs(minval + 0.5f) < 0.02, "test_normalize_brightness failed"); + // If the brightest value is lower than half-max, it will be boosted to half-max. + // Otherwise it will just be rescaled to [-0.5, 0.5]. Here we have the low-brightness case. + assert_(std::abs(maxval - 0.0f) < 0.02, "test_normalize_brightness failed"); +} + + +void run() +{ + test_find_input_intensity_quantile(); + test_normalize_brightness(); +} + + +}
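Following the comment in opencv_contrib.h about cv::Quatf::toRotVec(), a further test along the same lines could be added here. This is only a sketch, not part of the patch; it would additionally need #include "opencv_contrib.h" and <cmath> at the top of tests.cpp and a call from run():

    void test_toRotVec_identity()
    {
        // The identity rotation must map to a finite, (near-)zero rotation vector.
        const cv::Vec3f out = cvcontrib::toRotVec(cvcontrib::identity_quat());
        assert_(std::isfinite(out[0]) && std::isfinite(out[1]) && std::isfinite(out[2]),
                "test_toRotVec_identity failed");
        assert_(cv::norm(out) < 1.e-6, "test_toRotVec_identity failed");
    }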
\ No newline at end of file diff --git a/tracker-neuralnet/unscented_trafo.h b/tracker-neuralnet/unscented_trafo.h new file mode 100644 index 00000000..267aa969 --- /dev/null +++ b/tracker-neuralnet/unscented_trafo.h @@ -0,0 +1,132 @@ +#pragma once + +#include <algorithm> +#include <opencv2/core.hpp> +#include <opencv2/core/base.hpp> +#include <opencv2/core/quaternion.hpp> + +#include <cmath> +#include <vector> + +#include "opencv_contrib.h" + +namespace ukf_cv +{ + +using namespace cvcontrib; + +template<int dim, int otherdim = dim> +using SigmaPoints = std::array<cv::Vec<float,otherdim>,dim*2+1>; + + +// Ported from +// https://filterpy.readthedocs.io/en/latest/_modules/filterpy/kalman/sigma_points.html +// Excerpt from the original docu: +// " + +// Generates sigma points and weights according to Van der Merwe's +// 2004 dissertation[1] for the UnscentedKalmanFilter class.. It +// parametizes the sigma points using alpha, beta, kappa terms, and +// is the version seen in most publications. + +// Unless you know better, this should be your default choice. + +// alpha : float +// Determins the spread of the sigma points around the mean. +// Usually a small positive value (1e-3) according to [3]. + +// beta : float +// Incorporates prior knowledge of the distribution of the mean. For +// Gaussian x beta=2 is optimal, according to [3]. + +// kappa : float, default=0.0 +// Secondary scaling parameter usually set to 0 according to [4], +// or to 3-n according to [5]. + +// Reference +// .. [1] R. Van der Merwe "Sigma-Point Kalman Filters for Probabilitic +// Inference in Dynamic State-Space Models" (Doctoral dissertation) + +// " +template<int dim> +class MerweScaledSigmaPoints +{ +public: + static constexpr int num_sigmas = 2*dim+1; + + using Vector = cv::Vec<float,dim>; + using Matrix = cv::Matx<float,dim,dim>; + + MerweScaledSigmaPoints(float alpha = 0.01, float beta = 2., int kappa = 3-dim) + { + lambda = alpha*alpha * (dim + kappa) - dim; + const float c = .5 / (dim + lambda); + Wc_i = c; + Wm_i = c; + Wm_0 = lambda / (dim+lambda); + Wc_0 = Wm_0 + (1.-alpha*alpha + beta); + } + + SigmaPoints<dim> compute_sigmas(const Vector &mu, const Matrix &mat, bool is_tril_factor) const + { + const Matrix triu_factor = is_tril_factor ? mat.t() : cholesky(mat).t(); + + const Matrix U = triu_factor*std::sqrt(lambda+dim); + + SigmaPoints<dim> sigmas; + + sigmas[0] = mu; + for (int k=0; k<dim; ++k) + { + sigmas[k+1] = to_vec(mu + U.row(k).t()); + sigmas[dim+k+1] = to_vec(mu - U.row(k).t()); + } + return sigmas; + } + + template<int otherdim> + std::tuple<cv::Vec<float,otherdim> , cv::Matx<float,otherdim,otherdim>> compute_statistics(const SigmaPoints<dim,otherdim> &sigmas) const + { + cv::Vec<float,otherdim> mu{}; // Zero initializes + for (size_t i=0; i<sigmas.size(); ++i) + { + mu += to_vec((i==0 ? Wm_0 : Wm_i) * sigmas[i]); + } + + cv::Matx<float,otherdim,otherdim> cov{}; + for (size_t i=0; i<sigmas.size(); ++i) + { + const auto p = sigmas[i] - mu; + cov += (i==0 ? Wc_0 : Wc_i)*p*p.t(); + } + + return { mu, cov }; + } + + template<int otherdim> + cv::Matx<float,dim,otherdim> compute_cov(const SigmaPoints<dim,dim> &sigmas, const SigmaPoints<dim,otherdim> &othersigmas) const + { + cv::Vec<float,dim> mu{}; // Zero initializes + cv::Vec<float,otherdim> mu_other{}; // Zero initializes + for (size_t i=0; i<sigmas.size(); ++i) + { + mu += to_vec((i==0 ? Wm_0 : Wm_i) * sigmas[i]); + mu_other += to_vec((i==0 ? 
Wm_0 : Wm_i) * othersigmas[i]); + } + + cv::Matx<float,dim,otherdim> cov{}; + for (size_t i=0; i<sigmas.size(); ++i) + { + const auto p = sigmas[i] - mu; + const auto q = othersigmas[i] - mu_other; + cov += (i==0 ? Wc_0 : Wc_i)*p*q.t(); + } + + return cov; + } +private: + float Wc_i, Wm_i, Wm_0, Wc_0, lambda; +}; + + +} // namespace ukf_cv
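A hypothetical round trip through MerweScaledSigmaPoints, to illustrate the intended use of compute_sigmas() and compute_statistics(). This is a standalone sketch, not part of the patch, and assumes the headers above are on the include path:

    #include "unscented_trafo.h"
    #include <cstdio>

    int main()
    {
        using namespace ukf_cv;
        MerweScaledSigmaPoints<3> sp;  // default alpha/beta/kappa as in the class above
        const cv::Vec3f mu{1.f, 2.f, 3.f};
        const cv::Matx33f cov = cv::Matx33f::eye() * 0.04f;  // isotropic, stddev 0.2

        // Pass the covariance itself, not a Cholesky factor, hence is_tril_factor = false.
        const SigmaPoints<3> sigmas = sp.compute_sigmas(mu, cov, false);

        // Pushing the sigma points through the identity transform and re-estimating the
        // statistics should approximately recover mu and cov (up to float round-off).
        const auto [mu2, cov2] = sp.compute_statistics(sigmas);
        std::printf("mean   ~ %.3f %.3f %.3f\n", mu2[0], mu2[1], mu2[2]);
        std::printf("var(0) ~ %.4f\n", cov2(0, 0));
        return 0;
    }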
\ No newline at end of file