| author | Michael Welter <michael@welter-4d.de> | 2022-09-11 20:55:26 +0200 | 
|---|---|---|
| committer | Stanislaw Halik <sthalik@misaki.pl> | 2022-11-01 13:51:35 +0100 | 
| commit | 08f1fcad1c74e25f97641a0ccbd229b267ec528c (patch) | |
| tree | 000b1b276bc7df4a74fd493dab05bcce68801de8 /tracker-neuralnet | |
| parent | 77d6abaf53dbe2ee6334bd59b112e25d694a2f65 (diff) | |
tracker/nn: Tweaks, refactoring, deadzone filtering, and support for uncertainty estimation
* Add a rudimentary test for two functions; maybe more in the future
* Fix the rotation correction due to vertical translation
* Move preview class to new files
* Move neural network model adapters to new files
* Add utility functions for OpenCV
* Query the model inputs/outputs by name to see what is available
* Support outputs for the standard deviation of the data distribution,
  i.e. what you get if you let the model output the full parameters of a
  Gaussian distribution (conditioned on the inputs) and fit it with a
  negative log-likelihood loss (see the loss sketch after this list)
* Disable support for sequence models
* Add support for eye open/closed classification;
  scale the uncertainty estimate up when the eyes are closed
* Add a deadzone filter which activates if the model supports uncertainty
  quantification. The deadzone becomes larger the more uncertain the
  model/data are. This is mainly intended to suppress large estimation
  errors when the user blinks (see the attenuation sketch after this list)
* Fix the distance being twice what it should have been
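
For context, the uncertainty outputs correspond to training the network to predict both the mean and the spread of its own error. For a scalar target y with predicted mean mu and standard deviation sigma (symbols here are illustrative, not names from the model), the per-sample negative log-likelihood of a Gaussian is

    L(y; mu, sigma) = 0.5 * log(2*pi*sigma^2) + (y - mu)^2 / (2*sigma^2)

so the network is penalized both for being wrong and for being overconfident about it.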
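The deadzone behaviour can be summarized by the minimal sketch below, which mirrors the attenuation in apply_filter_to_offset from deadzone_filter.cpp further down; the helper name attenuate_offset is made up for illustration and only the OpenCV types used in the patch are assumed. The pose offset since the previous frame is measured against its own covariance, and offsets that are small relative to that uncertainty are smoothly suppressed.

```cpp
#include <cmath>
#include <opencv2/core.hpp>

// Sketch of the uncertainty-scaled deadzone: `offset` is the 6d pose delta
// (3 translation + 3 rotation-vector components) since the previous frame,
// `cov` its covariance from the unscented transform, `size` and `hardness`
// are the user-facing deadzone settings.
cv::Vec<float,6> attenuate_offset(const cv::Vec<float,6>& offset,
                                  const cv::Matx<float,6,6>& cov,
                                  float size, float hardness)
{
    bool ok = true;
    // Squared length of the offset measured in units of its own standard deviation.
    const float d2 = offset.dot(cov.inv(cv::DECOMP_CHOLESKY, &ok) * offset);
    // Sigmoid gate: offsets well inside the uncertainty band are damped towards
    // zero, offsets far outside it pass through almost unchanged.
    const float gate = ok ? 1.f / (1.f + std::exp(-(std::sqrt(d2) - size) * hardness)) : 1.f;
    return offset * gate;
}
```

Because the covariance grows when the network is uncertain (for example with closed eyes), the deadzone widens exactly when spurious jumps are most likely.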
Diffstat (limited to 'tracker-neuralnet')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | tracker-neuralnet/deadzone_filter.cpp | 162 |
| -rw-r--r-- | tracker-neuralnet/deadzone_filter.h | 37 |
| -rw-r--r-- | tracker-neuralnet/ftnoir_tracker_neuralnet.cpp | 822 |
| -rw-r--r-- | tracker-neuralnet/ftnoir_tracker_neuralnet.h | 127 |
| -rw-r--r-- | tracker-neuralnet/model_adapters.cpp | 433 |
| -rw-r--r-- | tracker-neuralnet/model_adapters.h | 102 |
| -rw-r--r-- | tracker-neuralnet/opencv_contrib.h | 120 |
| -rw-r--r-- | tracker-neuralnet/preview.cpp | 135 |
| -rw-r--r-- | tracker-neuralnet/preview.h | 60 |
| -rw-r--r-- | tracker-neuralnet/tests.cpp | 58 |
| -rw-r--r-- | tracker-neuralnet/unscented_trafo.h | 132 |
11 files changed, 1441 insertions, 747 deletions
| diff --git a/tracker-neuralnet/deadzone_filter.cpp b/tracker-neuralnet/deadzone_filter.cpp new file mode 100644 index 00000000..b41afdba --- /dev/null +++ b/tracker-neuralnet/deadzone_filter.cpp @@ -0,0 +1,162 @@ +#include "deadzone_filter.h" +#include "model_adapters.h" +#include "opencv_contrib.h" +#include "unscented_trafo.h" + +#include <opencv2/core/base.hpp> +#include <opencv2/core/matx.hpp> +#include <opencv2/core/quaternion.hpp> + +namespace neuralnet_tracker_ns +{ + +using namespace cvcontrib; + +using StateVec = cv::Vec<float,6>; +using StateCov = cv::Matx<float,6,6>; + +static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints<6>::num_sigmas; +static constexpr float img_scale = 200.f; +static constexpr float world_scale = 1000.f; // mm + +StateCov make_tangent_space_uncertainty_tril(const PoseEstimator::Face &face) +{ +    StateCov tril = StateCov::eye(); +    tril(0,0) = face.center_stddev.x/img_scale; +    tril(1,1) = face.center_stddev.y/img_scale; +    tril(2,2) = face.size_stddev/img_scale; +    set_minor<3,3>(tril, 3, 3, face.rotaxis_cov_tril); +    return tril; +} + + +QuatPose apply_offset(const QuatPose& pose, const StateVec& offset) +{ +    // Unpack +    const cv::Vec3f dp = { offset[0], offset[1], offset[2] }; +    const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] }); +    const auto p = pose.pos + dp; +    const auto r = pose.rot * dr; +    return { r, p }; +} + + +PoseEstimator::Face apply_offset(const PoseEstimator::Face& face, const StateVec& offset) +{ +    const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] }); +    const auto r = face.rotation * dr; +     +    const cv::Point2f p = {  +        face.center.x + offset[0]*img_scale,  +        face.center.y + offset[1]*img_scale  +    }; + +    // Intercept the case where the head size stddev became so large that the sigma points +    // were created with negative head size (mean - constant*stddev ...). Negative head size +    // is bad. But this is fine. The unscented transform where this function comes into play +    // is designed to handle non-linearities like this. +    const float sz = std::max(0.1f*face.size, face.size + offset[2]*img_scale); + +    return PoseEstimator::Face{ +        r, +        {}, +        {}, +        p, +        {}, +        sz, +        {} +    }; +} + + +StateVec relative_to(const QuatPose& reference, const QuatPose& pose) +{ +    const auto p = pose.pos - reference.pos; +    const auto r = toRotVec(reference.rot.conjugate()*pose.rot); +    return StateVec{ p[0], p[1], p[2], r[0], r[1], r[2] }; +} + + +ukf_cv::SigmaPoints<6> relative_to(const QuatPose& pose, const std::array<QuatPose,num_sigmas>& sigmas) +{ +    ukf_cv::SigmaPoints<6> out; +    std::transform(sigmas.begin(), sigmas.end(), out.begin(), [&pose](const QuatPose& s) { +        return relative_to(pose, s); +    }); +    return out; +} + + +std::array<QuatPose,num_sigmas> compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints<6>& sigmas, Face2WorldFunction face2world) +{ +    std::array<QuatPose,num_sigmas> out; +    std::transform(sigmas.begin(), sigmas.end(), out.begin(), [face2world=std::move(face2world), &face](const StateVec& sigma_point) { +        // First unpack the state vector and generate quaternion rotation w.r.t image space. +        const auto sigma_face = apply_offset(face, sigma_point); +        // Then transform ... 
+        QuatPose pose = face2world(sigma_face.rotation, sigma_face.center, sigma_face.size); +        pose.pos /= world_scale; +        return pose; +    }); +    return out; +} + + +StateVec apply_filter_to_offset(const StateVec& offset, const StateCov& offset_cov, float, const FiltParams& params) +{ +    // Offset and Cov represent a multivariate normal distribution, which is the probability of the new pose measured w.r.t the previous one. +    // Prob(x) ~exp(-(x-mu)t Cov^-1 (x-mu)) +    // We want to attenuate this offset, or zero it out completely, to obtain a deadzone-filter behaviour. The size of the deadzone shall be +    // determined by the covariance projected to the offset direction like so: +    // Take x = mu - mu / |mu| * alpha +    // p(alpha) ~exp(-alpha^2 / |mu|^2 * mut Cov^-1 mu) = ~exp(-alpha^2 / sigma^2) with sigma^2 = mut Cov^-1 mu / |mu|^2. +    // So this projection is like a 1d normal distribution with some standard deviation, which we take to scale the deadzone. + +    bool ok = true; + +    const float len_div_sigma_sqr = offset.dot(offset_cov.inv(cv::DECOMP_CHOLESKY, &ok) * offset); +     +    const float attenuation = (ok) ? sigmoid((std::sqrt(len_div_sigma_sqr) - params.deadzone_size)*params.deadzone_hardness) : 1.f; + +    // { +    //     std::cout << "cov diag: " << offset_cov.diag() << std::endl; +    //     std::cout << "offset: " << cv::norm(offset) << std::endl; +    //     std::cout << "len_div_sigma_sqr: " << cv::norm(len_div_sigma_sqr) << std::endl; +    //      std::cout << "attenuation (" << ok << "): " << attenuation << std::endl; +    // } + +    return offset*attenuation; +} + + +QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose_, float dt, Face2WorldFunction face2world, const FiltParams& params) +{ +    ukf_cv::MerweScaledSigmaPoints<6> unscentedtrafo; +    auto previous_pose = previous_pose_; +    previous_pose.pos /= world_scale; + +    // Here we have the covariance matrix for the offset from the observed values in `face`. +    const auto cov_tril = make_tangent_space_uncertainty_tril(face); + +    // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose. +    const ukf_cv::SigmaPoints<6> sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true); + +    // We have many of these sigma points. This is why that callback comes into play here. +    const std::array<QuatPose,num_sigmas> pose_sigmas = compute_world_pose_from_sigma_point(face, sigmas, std::move(face2world)); + +    const ukf_cv::SigmaPoints<6> deltas_sigmas = relative_to(previous_pose, pose_sigmas); + +    const auto [offset, offset_cov] = unscentedtrafo.compute_statistics(deltas_sigmas); + +    // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive at the final output. +    const StateVec scaled_offset = apply_filter_to_offset(offset, offset_cov, dt, params); + +    QuatPose new_pose = apply_offset(previous_pose, scaled_offset); + +    new_pose.pos *= world_scale; + +    return new_pose; +} + + +} // namespace neuralnet_tracker_ns
\ No newline at end of file diff --git a/tracker-neuralnet/deadzone_filter.h b/tracker-neuralnet/deadzone_filter.h new file mode 100644 index 00000000..a9b6aada --- /dev/null +++ b/tracker-neuralnet/deadzone_filter.h @@ -0,0 +1,37 @@ +#pragma once + +#include "unscented_trafo.h" +#include "opencv_contrib.h" +#include "model_adapters.h" + +namespace neuralnet_tracker_ns +{ + +/// Represents a 6d pose by quaternion rotation and position vector. +struct QuatPose { +    cv::Quatf rot; +    cv::Vec3f pos; +}; + +struct FiltParams +{ +    float deadzone_hardness = 1.f; +    float deadzone_size = 1.f; +}; + +/** Callback type for converting data from the `Face` struct to a 6d pose. +* +* This callback is needed because it depends on things that the filter doesn't have to know about and it is called multiple times +* due to the way how uncertainty estimates are handled +*/ +using Face2WorldFunction = std::function<QuatPose (const cv::Quatf&, const cv::Point2f&, float)>; + +/** Applies a deadzone filter similar to the one used in the Hamilton filter. +* +* What sets this apart is that the deadzone size scales with the uncertainty estimate of the network. +* The rotation uncertainty is represented by a covariance matrix for the distribution of a rotation vector which +* describes the offset from the mean rotation (the quaternion in the `Face` struct). +*/ +QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose, float dt, Face2WorldFunction face2world, const FiltParams& params); + +} // namespace neuralnet_tracker_ns
\ No newline at end of file diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp index 62209978..b56b4e96 100644 --- a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp +++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp @@ -6,17 +6,19 @@   */  #include "ftnoir_tracker_neuralnet.h" +#include "deadzone_filter.h" +#include "opencv_contrib.h" +  #include "compat/sleep.hpp"  #include "compat/math-imports.hpp" -#include "cv/init.hpp" -#include <opencv2/core.hpp> -#include <opencv2/core/hal/interface.h> -#include <opencv2/core/types.hpp> -#include <opencv2/calib3d.hpp> -#include <opencv2/imgcodecs.hpp>  #include "compat/timer.hpp"  #include "compat/check-visible.hpp" +#include "cv/init.hpp" +  #include <omp.h> +#include <onnxruntime_cxx_api.h> +#include <opencv2/core.hpp> +#include <opencv2/core/quaternion.hpp>  #ifdef _MSC_VER  #   pragma warning(disable : 4702) @@ -32,7 +34,7 @@  #include <chrono>  #include <string>  #include <stdexcept> - +#include <unordered_map>  // Some demo code for onnx  // https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp @@ -41,12 +43,11 @@  namespace neuralnet_tracker_ns  { +using namespace cvcontrib;  using numeric_types::vec3;  using numeric_types::vec2;  using numeric_types::mat33; -using quat = std::array<numeric_types::f,4>; -  #if _MSC_VER  std::wstring convert(const QString &s) { return s.toStdWString(); } @@ -55,6 +56,18 @@ std::string convert(const QString &s) { return s.toStdString(); }  #endif +int enum_to_fps(int value) +{ +    switch (value) +    { +        case fps_30:        return 30; +        case fps_60:        return 60; +        default: [[fallthrough]]; +        case fps_default:   return 0; +    } +} + +  template<class F>  struct OnScopeExit  { @@ -67,28 +80,38 @@ struct OnScopeExit  }; -float sigmoid(float x) +CamIntrinsics make_intrinsics(const cv::Mat& img, const Settings& settings)  { -    return 1.f/(1.f + std::exp(-x)); -} - +    const int w = img.cols, h = img.rows; +    const double diag_fov = settings.fov * M_PI / 180.; +    const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w)); +    const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h)); +    const double focal_length_w = 1. / tan(.5 * fov_w); +    const double focal_length_h = 1. / tan(.5 * fov_h); +    /*  a +      ______  <--- here is sensor area +      |    / +      |   / +    f |  / +      | /  2 x angle is the fov +      |/ +        <--- here is the hole of the pinhole camera + +      So, a / f = tan(fov / 2) +      => f = a/tan(fov/2) +      What is a? +      1 if we define f in terms of clip space where the image plane goes from -1 to 1. Because a is the half-width. 
+    */ -cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h) -{ -    auto [w, h] = size; -    if ( w*aspect_h > aspect_w*h ) -    { -        // Image is too wide -        const int new_w = (aspect_w*h)/aspect_h; -        return cv::Rect((w - new_w)/2, 0, new_w, h); -    } -    else -    { -        const int new_h = (aspect_h*w)/aspect_w; -        return cv::Rect(0, (h - new_h)/2, w, new_h); -    } +    return { +        (float)focal_length_w, +        (float)focal_length_h, +        (float)fov_w, +        (float)fov_h +    };  } +  cv::Rect make_crop_rect_multiple_of(const cv::Size &size, int multiple)  {      const int new_w = (size.width / multiple) * multiple; @@ -111,20 +134,6 @@ cv::Rect_<T> squarize(const cv::Rect_<T> &r)  template<class T> -cv::Point_<T> as_point(const cv::Size_<T>& s) -{ -    return { s.width, s.height }; -} - - -template<class T> -cv::Size_<T> as_size(const cv::Point_<T>& p) -{ -    return { p.x, p.y }; -} - - -template<class T>  cv::Rect_<T> expand(const cv::Rect_<T>& r, T factor)  {      // xnew = l+.5*w - w*f*0.5 = l + .5*(w - new_w) @@ -145,110 +154,40 @@ cv::Rect_<T> ewa_filter(const cv::Rect_<T>& last, const cv::Rect_<T>& current, T  } -cv::Rect2f unnormalize(const cv::Rect2f &r, int h, int w) -{ -    auto unnorm = [](float x) -> float { return 0.5*(x+1); }; -    auto tl = r.tl(); -    auto br = r.br(); -    auto x0 = unnorm(tl.x)*w; -    auto y0 = unnorm(tl.y)*h; -    auto x1 = unnorm(br.x)*w; -    auto y1 = unnorm(br.y)*h; -    return { -        x0, y0, x1-x0, y1-y0 -    }; -} - -cv::Point2f normalize(const cv::Point2f &p, int h, int w) -{ -    return { -        p.x/w*2.f-1.f, -        p.y/h*2.f-1.f -    }; -} - - -mat33 rotation_from_two_vectors(const vec3 &a, const vec3 &b) -{ -    vec3 axis = a.cross(b); -    const float len_a = cv::norm(a); -    const float len_b = cv::norm(b); -    const float len_axis = cv::norm(axis); -    const float sin_angle = std::clamp(len_axis / (len_a * len_b), -1.f, 1.f); -    const float angle = std::asin(sin_angle); -    axis *= angle/(1.e-12 + len_axis); -    mat33 out; -    cv::Rodrigues(axis, out); -    return out; -} - - -// Computes correction due to head being off screen center. -// x, y: In screen space, i.e. 
in [-1,1] -// focal_length_x: In screen space -mat33 compute_rotation_correction(const cv::Point2f &p, float focal_length_x) -{ -    return rotation_from_two_vectors( -        {1.f,0.f,0.f},  -        {focal_length_x, p.y, p.x}); -} - - -mat33 quaternion_to_mat33(const std::array<float,4> quat) -{ -    mat33 m; -    const float w = quat[0]; -    const float i = quat[1]; -    const float j = quat[2]; -    const float k = quat[3]; -    m(0,0) = 1.f - 2.f*(j*j + k*k); -    m(1,0) =       2.f*(i*j + k*w); -    m(2,0) =       2.f*(i*k - j*w); -    m(0,1) =       2.f*(i*j - k*w); -    m(1,1) = 1.f - 2.f*(i*i + k*k); -    m(2,1) =       2.f*(j*k + i*w); -    m(0,2) =       2.f*(i*k + j*w); -    m(1,2) =       2.f*(j*k - i*w); -    m(2,2) = 1.f - 2.f*(i*i + j*j); -    return m; -} - - -vec3 rotate_vec(const quat& q, const vec3& p) +cv::Vec3f image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)  { -    const float qw = q[0]; -    const float qi = q[1]; -    const float qj = q[2]; -    const float qk = q[3]; -    const float pi = p[0]; -    const float pj = p[1]; -    const float pk = p[2]; -    const quat tmp{ -                        - qi*pi - qj*pj - qk*pk, -        qw*pi                   + qj*pk - qk*pj, -        qw*pj - qi*pk                   + qk*pi, -        qw*pk + qi*pj - qj*pi                   -    }; -    const vec3 out { -        -tmp[0]*qi + tmp[1]*qw - tmp[2]*qk + tmp[3]*qj, -        -tmp[0]*qj + tmp[1]*qk + tmp[2]*qw - tmp[3]*qi, -        -tmp[0]*qk - tmp[1]*qj + tmp[2]*qi + tmp[3]*qw -    }; -    return out; -} - - -vec3 image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) -{ -    // Compute the location the network outputs in 3d space. -    const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / size * reference_size_in_mm; +    /* +    Compute the location the network outputs in 3d space. +     +       hhhhhh  <- head size (meters) +      \      | ----------------------- +       \     |                         \ +        \    |                          | +         \   |                          |- x (meters) +          ____ <- face.size / width     | +           \ |  |                       | +            \|  |- focal length        / +               ------------------------ +            ------------------------------------------------>> z direction +        z/x = zi / f +        zi = image position +        z = world position +        f = focal length + +        We can also do deltas: +        dz / x = dzi / f +        => x = dz / dzi * f +        which means we can compute x from the head size (dzi) if we assume some reference size (dz). +    */ +    const float head_size_vertical = 2.f*size;  // Size from the model is more like half the real vertical size of a human head. 
+    const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / head_size_vertical * reference_size_in_mm;      const float zpos = (x / image_size.width * 2.f - 1.f) * xpos / intrinsics.focal_length_w;      const float ypos = (y / image_size.height * 2.f - 1.f) * xpos / intrinsics.focal_length_h;      return {xpos, ypos, zpos};  } -vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) +vec2 world_to_image(const cv::Vec3f& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)  {      const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w;      const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h; @@ -258,7 +197,7 @@ vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntr  } -quat image_to_world(quat q) +cv::Quatf image_to_world(cv::Quatf q)  {      std::swap(q[1], q[3]);      q[1] = -q[1]; @@ -267,338 +206,73 @@ quat image_to_world(quat q)      return q;  } -quat world_to_image(quat q) -{ -    // It's its own inverse. -    return image_to_world(q); -} - -// Intersection over union. A value between 0 and 1 which measures the match between the bounding boxes. -template<class T> -T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b) -{ -    auto i = a & b; -    return double{i.area()} / (a.area()+b.area()-i.area()); -} - -// Returns width and height of the input tensor, or throws. -// Expects the model to take one tensor as input that must -// have the shape B x C x H x W, where B=C=1. -cv::Size get_input_image_shape(const Ort::Session &session) -{ -    if (session.GetInputCount() < 1) -        throw std::invalid_argument("Model must take at least one input tensor"); -    const std::vector<std::int64_t> shape =  -        session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); -    if (shape.size() != 4) -        throw std::invalid_argument("Model takes the input tensor in the wrong shape"); -    return { static_cast<int>(shape[3]), static_cast<int>(shape[2]) }; -} - -Ort::Value create_tensor(const Ort::TypeInfo& info, Ort::Allocator& alloc) +cv::Point2f normalize(const cv::Point2f &p, int h, int w)  { -    const auto shape = info.GetTensorTypeAndShapeInfo().GetShape(); -    auto t = Ort::Value::CreateTensor<float>( -        alloc, shape.data(), shape.size()); -    memset(t.GetTensorMutableData<float>(), 0, sizeof(float)*info.GetTensorTypeAndShapeInfo().GetElementCount()); -    return t; +    return { +        p.x/w*2.f-1.f, +        p.y/h*2.f-1.f +    };  } - -int enum_to_fps(int value) +cv::Quatf rotation_from_two_vectors(const vec3 &a, const vec3 &b)  { -    switch (value) +    // |axis| = |a| * |b| * sin(alpha) +    const vec3 axis = a.cross(b); +    // dot = |a|*|b|*cos(alpha) +    const float dot = a.dot(b); +    const float len = cv::norm(axis); +    vec3 normed_axis = axis / len; +    float angle = std::atan2(len, dot); +    if (!(std::isfinite(normed_axis[0]) && std::isfinite(normed_axis[1]) && std::isfinite(normed_axis[2])))      { -        case fps_30:        return 30; -        case fps_60:        return 60; -        default: [[fallthrough]]; -        case fps_default:   return 0; +        angle = 0.f; +        normed_axis = vec3{1.,0.,0.};      } +    return cv::Quatf::createFromAngleAxis(angle, normed_axis);  } -Localizer::Localizer(Ort::MemoryInfo &allocator_info, Ort::Session &&session) : -    session_{std::move(session)}, -    scaled_frame_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_8U), -    input_mat_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_32F) -{ -    
// Only works when input_mat does not reallocated memory ...which it should not. -    // Non-owning memory reference to input_mat? -    // Note: shape = (bach x channels x h x w) -    const std::int64_t input_shape[4] = { 1, 1, INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH }; -    input_val_ = Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4); - -    const std::int64_t output_shape[2] = { 1, 5 }; -    output_val_ = Ort::Value::CreateTensor<float>(allocator_info, results_.data(), results_.size(), output_shape, 2); -} - - -std::pair<float, cv::Rect2f> Localizer::run( -    const cv::Mat &frame) -{ -    auto p = input_mat_.ptr(0); - -    cv::resize(frame, scaled_frame_, { INPUT_IMG_WIDTH, INPUT_IMG_HEIGHT }, 0, 0, cv::INTER_AREA); -    scaled_frame_.convertTo(input_mat_, CV_32F, 1./255., -0.5); - -    assert (input_mat_.ptr(0) == p); -    assert (!input_mat_.empty() && input_mat_.isContinuous()); -    assert (input_mat_.cols == INPUT_IMG_WIDTH && input_mat_.rows == INPUT_IMG_HEIGHT); - -    const char* input_names[] = {"x"}; -    const char* output_names[] = {"logit_box"}; - -    Timer t; t.start(); - -    session_.Run(Ort::RunOptions{nullptr}, input_names, &input_val_, 1, output_names, &output_val_, 1); - -    last_inference_time_ = t.elapsed_ms(); - -    const cv::Rect2f roi = unnormalize(cv::Rect2f{ -        results_[1], -        results_[2], -        results_[3]-results_[1], // Width -        results_[4]-results_[2] // Height -    }, frame.rows, frame.cols); -    const float score = sigmoid(results_[0]); - -    return { score, roi }; -} - - -double Localizer::last_inference_time_millis() const +// Computes correction due to head being off screen center. +cv::Quatf compute_rotation_correction(const cv::Point3f& p)  { -    return last_inference_time_; +    return rotation_from_two_vectors( +        {-1.f,0.f,0.f}, p);  } -PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session)  -    : model_version_{_session.GetModelMetadata().GetVersion()} -    , session_{std::move(_session)} -    , allocator_{session_, allocator_info} +// Intersection over union. A value between 0 and 1 which measures the match between the bounding boxes. +template<class T> +T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b)  { -    using namespace std::literals::string_literals; - -    if (session_.GetOutputCount() < 2) -        throw std::runtime_error("Invalid Model: must have at least two outputs"); - -    // WARNING UB .. but still ... -    // If the model was saved without meta data, it seems the version field is uninitialized. -    // In that case reading from it is UB. However, we will just get same arbitrary number -    // which is hopefully different from the numbers used by models where the version is set. -    // I.e., this is what happended in practice so far. 
-    if (model_version_ != 2) -        model_version_ = 1; - -    const cv::Size input_image_shape = get_input_image_shape(session_); - -    scaled_frame_ = cv::Mat(input_image_shape, CV_8U, cv::Scalar(0)); -    input_mat_ = cv::Mat(input_image_shape, CV_32F, cv::Scalar(0.f)); - -    { -        const std::int64_t input_shape[4] = { 1, 1, input_image_shape.height, input_image_shape.width }; -        input_val_.push_back( -            Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4)); -    } - -    { -        const std::int64_t output_shape[2] = { 1, 3 }; -        output_val_.push_back(Ort::Value::CreateTensor<float>( -            allocator_info, &output_coord_[0], output_coord_.rows, output_shape, 2)); -    } - -    { -        const std::int64_t output_shape[2] = { 1, 4 }; -        output_val_.push_back(Ort::Value::CreateTensor<float>( -            allocator_info, &output_quat_[0], output_quat_.rows, output_shape, 2)); -    } - -    size_t num_regular_outputs = 2; - -    if (session_.GetOutputCount() >= 3 && "box"s == session_.GetOutputName(2, allocator_)) -    { -        const std::int64_t output_shape[2] = { 1, 4 }; -        output_val_.push_back(Ort::Value::CreateTensor<float>( -            allocator_info, &output_box_[0], output_box_.rows, output_shape, 2)); -        ++num_regular_outputs; -        qDebug() << "Note: Legacy model output for face ROI is currently ignored"; -    } - -    num_recurrent_states_ = session_.GetInputCount()-1; -    if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_) -        throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers."); - -    // Create tensors for recurrent state -    for (size_t i = 0; i < num_recurrent_states_; ++i) -    { -        const auto& input_info = session_.GetInputTypeInfo(1+i); -        const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i); -        if (input_info.GetTensorTypeAndShapeInfo().GetShape() !=  -            output_info.GetTensorTypeAndShapeInfo().GetShape()) -            throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output"); -        input_val_.push_back(create_tensor(input_info, allocator_)); -        output_val_.push_back(create_tensor(output_info, allocator_)); -    } - -    for (size_t i = 0; i < session_.GetInputCount(); ++i) -    { -        input_names_.push_back(session_.GetInputName(i, allocator_)); -    } -    for (size_t i = 0; i < session_.GetOutputCount(); ++i) -    { -        output_names_.push_back(session_.GetOutputName(i, allocator_)); -    } - -    qDebug() << "Model inputs: " << session_.GetInputCount() << ", outputs: " << session_.GetOutputCount() << ", recurrent states: " << num_recurrent_states_; - -    assert (input_names_.size() == input_val_.size()); -    assert (output_names_.size() == output_val_.size()); +    auto i = a & b; +    return double{i.area()} / (a.area()+b.area()-i.area());  } -int PoseEstimator::find_input_intensity_90_pct_quantile() const +class GuardedThreadCountSwitch  { -    const int channels[] = { 0 }; -    const int hist_size[] = { 255 }; -    float range[] = { 0, 256 }; -    const float* ranges[] = { range }; -    cv::Mat hist; -    cv::calcHist(&scaled_frame_, 1,  channels, cv::Mat(), hist, 1, hist_size, ranges, true, false); -    int gray_level = 0; -    const int 
num_pixels_quantile = scaled_frame_.total()*0.9f; -    int num_pixels_accum = 0; -    for (int i=0; i<hist_size[0]; ++i) -    { -        num_pixels_accum += hist.at<float>(i); -        if (num_pixels_accum > num_pixels_quantile) +    int old_num_threads_cv_ = 1; +    int old_num_threads_omp_ = 1; +    public: +        GuardedThreadCountSwitch(int num_threads)          { -            gray_level = i; -            break; +            old_num_threads_cv_ = cv::getNumThreads(); +            old_num_threads_omp_ = omp_get_num_threads(); +            omp_set_num_threads(num_threads); +            cv::setNumThreads(num_threads); +        } +             +        ~GuardedThreadCountSwitch() +        { +            omp_set_num_threads(old_num_threads_omp_); +            cv::setNumThreads(old_num_threads_cv_);          } -    } -    return gray_level; -} - - -std::optional<PoseEstimator::Face> PoseEstimator::run( -    const cv::Mat &frame, const cv::Rect &box) -{ -    cv::Mat cropped; - -    const int patch_size = std::max(box.width, box.height)*1.05; -    const cv::Point2f patch_center = { -        std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols), -        std::clamp<float>(box.y + 0.5f*box.height, 0.f, frame.rows) -    }; -    cv::getRectSubPix(frame, {patch_size, patch_size}, patch_center, cropped); - -    // Will get failure if patch_center is outside image boundariesettings. -    // Have to catch this case. -    if (cropped.rows != patch_size || cropped.cols != patch_size) -        return {}; - -    auto p = input_mat_.ptr(0); - -    cv::resize(cropped, scaled_frame_, scaled_frame_.size(), 0, 0, cv::INTER_AREA); - -    // Automatic brightness amplification. -    const int brightness = find_input_intensity_90_pct_quantile(); -    const double alpha = brightness<127 ? 0.5/std::max(5,brightness) : 1./255; -    const double beta = -0.5; - -    scaled_frame_.convertTo(input_mat_, CV_32F, alpha, beta); - -    assert (input_mat_.ptr(0) == p); -    assert (!input_mat_.empty() && input_mat_.isContinuous()); - - -    Timer t; t.start(); - -    try -    { -        session_.Run( -            Ort::RunOptions{ nullptr },  -            input_names_.data(),  -            input_val_.data(),  -            input_val_.size(),  -            output_names_.data(),  -            output_val_.data(),  -            output_val_.size()); -    } -    catch (const Ort::Exception &e) -    { -        qDebug() << "Failed to run the model: " << e.what(); -        return {}; -    } - -    for (size_t i = 0; i<num_recurrent_states_; ++i) -    { -        // Next step, the current output becomes the input. -        // Thus we realize the recurrent connection. -        // Only swaps the internal pointers. There is no copy of data. -        std::swap( -            output_val_[output_val_.size()-num_recurrent_states_+i], -            input_val_[input_val_.size()-num_recurrent_states_+i]); -    } - -    // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why? -    //        The instructions are always the same. Maybe a memory allocation -    //        issue. The ONNX api suggests that tensor are allocated in an -    //        arena. Does that matter? Maybe the issue is something else? - -    last_inference_time_ = t.elapsed_ms(); - -    // Perform coordinate transformation. -    // From patch-local normalized in [-1,1] to -    // frame unnormalized pixel coordinatesettings. 
- -    const cv::Point2f center = patch_center +  -        (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]}; - -    const float size = patch_size*0.5f*output_coord_[2]; - -    // Following Eigen which uses quat components in the order w, x, y, z. -    quat rotation = {  -        output_quat_[3],  -        output_quat_[0],  -        output_quat_[1],  -        output_quat_[2] }; - -    if (model_version_ < 2) -    { -        // Due to a change in coordinate conventions -        rotation = world_to_image(rotation); -    } - -    const cv::Rect2f outbox = { -        patch_center.x + (0.5f*patch_size)*output_box_[0], -        patch_center.y + (0.5f*patch_size)*output_box_[1], -        0.5f*patch_size*(output_box_[2]-output_box_[0]), -        0.5f*patch_size*(output_box_[3]-output_box_[1]) -    }; - -    return std::optional<Face>({ -        rotation, outbox, center, size -    }); -} - - -cv::Mat PoseEstimator::last_network_input() const -{ -    assert(!input_mat_.empty()); -    cv::Mat ret; -    input_mat_.convertTo(ret, CV_8U, 255., 127.); -    cv::cvtColor(ret, ret, cv::COLOR_GRAY2RGB); -    return ret; -} - -double PoseEstimator::last_inference_time_millis() const -{ -    return last_inference_time_; -} +        GuardedThreadCountSwitch(const GuardedThreadCountSwitch&) = delete; +        GuardedThreadCountSwitch& operator=(const GuardedThreadCountSwitch&) = delete; +};  bool NeuralNetTracker::detect() @@ -645,6 +319,8 @@ bool NeuralNetTracker::detect()      if (!last_roi_)      { +        // Last iteration the tracker failed to generate a trustworthy  +        // roi and the localizer also cannot find a face.          draw_gizmos({}, {});          return false;      } @@ -655,37 +331,28 @@ bool NeuralNetTracker::detect()      if (!face)      {          last_roi_.reset(); -        draw_gizmos(*face, {}); +        draw_gizmos({}, {});          return false;      } -    { -        // Here: compute ROI from head size estimate. This helps make the size prediction more -        //       invariant to mouth opening. The tracking can be lost more often at extreme -        //       rotations, depending on the implementation details. The code down here has -        //       been tweaked so that it works pretty well. 
-        // In old behaviour  ROI is taken from the model outputs -        const vec3 offset = rotate_vec(face->rotation, vec3{0.f, 0.1f*face->size, face->size*0.3f}); -        const float halfsize = face->size/float(settings_.roi_zoom); -        face->box = cv::Rect2f( -            face->center.x + offset[0] - halfsize, -            face->center.y + offset[1] - halfsize, -            halfsize*2.f, -            halfsize*2.f -        ); -    } - -    last_roi_ = ewa_filter(*last_roi_, face->box, float(settings_.roi_filter_alpha)); - -    Affine pose = compute_pose(*face); +    cv::Rect2f roi = expand(face->box, (float)settings_.roi_zoom); -    draw_gizmos(*face, pose); +    last_roi_ = ewa_filter(*last_roi_, roi, float(settings_.roi_filter_alpha)); +    QuatPose pose = compute_filtered_pose(*face); +    last_pose_ = pose; +     +    Affine pose_affine = {  +        pose.rot.toRotMat3x3(cv::QUAT_ASSUME_UNIT),  +        pose.pos }; +          {          QMutexLocker lck(&mtx_); -        this->pose_ = pose; +        last_pose_affine_ = pose_affine;      } +    draw_gizmos(*face, last_pose_affine_); +      return true;  } @@ -697,168 +364,74 @@ void NeuralNetTracker::draw_gizmos(      if (!is_visible_)          return; -    preview_.draw_gizmos(face, pose, last_roi_, last_localizer_roi_, world_to_image(pose.t, grayscale_.size(), intrinsics_)); +    preview_.draw_gizmos( +        face,  +        last_roi_,  +        last_localizer_roi_,  +        world_to_image(pose.t, grayscale_.size(), intrinsics_));      if (settings_.show_network_input)      {          cv::Mat netinput = poseestimator_->last_network_input();          preview_.overlay_netinput(netinput);      } - -    //preview_.draw_fps(fps, last_inference_time);  } -Affine NeuralNetTracker::compute_pose(const PoseEstimator::Face &face) const +QuatPose NeuralNetTracker::transform_to_world_pose(const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) const  { -    // Compute the location the network outputs in 3d space. - -    const mat33 rot_correction = compute_rotation_correction( -        normalize(face.center, grayscale_.rows, grayscale_.cols), -        intrinsics_.focal_length_w); - -    const mat33 m = rot_correction * quaternion_to_mat33( -        image_to_world(face.rotation)); - -    /* -          -       hhhhhh  <- head size (meters) -      \      | ----------------------- -       \     |                         \ -        \    |                          | -         \   |                          |- tz (meters) -          ____ <- face.size / width     | -           \ |  |                       | -            \|  |- focal length        / -               ------------------------ -    */ -      const vec3 face_world_pos = image_to_world( -        face.center.x, face.center.y, face.size, HEAD_SIZE_MM, +        face_xy.x, face_xy.y, face_size, HEAD_SIZE_MM,          grayscale_.size(),          intrinsics_); +    const cv::Quatf rot_correction = compute_rotation_correction( +        face_world_pos); +     +    cv::Quatf rot = rot_correction * image_to_world(face_rotation); +      // But this is in general not the location of the rotation joint in the neck. -    // So we need an extra offset. Which we determine by solving +    // So we need an extra offset. 
Which we determine by computing      // z,y,z-pos = head_joint_loc + R_face * offset -    const vec3 pos = face_world_pos -        + m * vec3{ -            static_cast<float>(settings_.offset_fwd), -            static_cast<float>(settings_.offset_up), -            static_cast<float>(settings_.offset_right)}; +    const vec3 local_offset = vec3{ +        static_cast<float>(settings_.offset_fwd), +        static_cast<float>(settings_.offset_up), +        static_cast<float>(settings_.offset_right)}; +    const vec3 offset = rotate(rot, local_offset); +    const vec3 pos = face_world_pos + offset; -    return { m, pos }; +    return { rot, pos };  } -void Preview::init(const cv_video_widget& widget) +QuatPose NeuralNetTracker::compute_filtered_pose(const PoseEstimator::Face &face)  { -    auto [w,h] = widget.preview_size(); -    preview_size_ = { w, h }; -} - - -void Preview::copy_video_frame(const cv::Mat& frame) -{ -    cv::Rect roi = make_crop_rect_for_aspect(frame.size(), preview_size_.width, preview_size_.height); -     -    cv::resize(frame(roi), preview_image_, preview_size_, 0, 0, cv::INTER_NEAREST); - -    offset_ = { (float)-roi.x, (float)-roi.y }; -    scale_ = float(preview_image_.cols) / float(roi.width); -} - - -void Preview::draw_gizmos( -    const std::optional<PoseEstimator::Face> &face, -    const Affine& pose, -    const std::optional<cv::Rect2f>& last_roi, -    const std::optional<cv::Rect2f>& last_localizer_roi, -    const cv::Point2f& neckjoint_position) -{ -    if (preview_image_.empty()) -        return; - -    if (last_roi)  +    if (fps_ > 0.01 && last_pose_ && poseestimator_->has_uncertainty())      { -        const int col = 255; -        cv::rectangle(preview_image_, transform(*last_roi), cv::Scalar(0, col, 0), /*thickness=*/1); -    } -    if (last_localizer_roi) -    { -        const int col = 255; -        cv::rectangle(preview_image_, transform(*last_localizer_roi), cv::Scalar(col, 0, 255-col), /*thickness=*/1); +        auto image2world = [this](const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) { +                return this->transform_to_world_pose(face_rotation, face_xy, face_size); }; +         +        return apply_filter( +            face,  +            *last_pose_,  +            1./fps_,  +            std::move(image2world), +            FiltParams{ +                float(settings_.deadzone_hardness), +                float(settings_.deadzone_size) +            });      } - -    if (face) +    else      { -        if (face->size>=1.f) -            cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), int(transform(face->size)), cv::Scalar(255,255,255), 2); -        cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), 3, cv::Scalar(255,255,255), -1); - -        auto draw_coord_line = [&](int i, const cv::Scalar& color) -        { -            const float vx = -pose.R(2,i); -            const float vy = -pose.R(1,i); -            static constexpr float len = 100.f; -            cv::Point q = face->center + len*cv::Point2f{vx, vy}; -            cv::line(preview_image_, static_cast<cv::Point>(transform(face->center)), static_cast<cv::Point>(transform(q)), color, 2); -        }; -        draw_coord_line(0, {0, 0, 255}); -        draw_coord_line(1, {0, 255, 0}); -        draw_coord_line(2, {255, 0, 0}); - -        // Draw the computed joint position -        auto xy = transform(neckjoint_position); -        cv::circle(preview_image_, cv::Point(xy.x,xy.y), 5, cv::Scalar(0,0,255), 
-1); +        return transform_to_world_pose(face.rotation, face.center, face.size);      }  } -void Preview::overlay_netinput(const cv::Mat& netinput) -{ -    if (netinput.empty()) -        return; - -    const int w = std::min(netinput.cols, preview_image_.cols); -    const int h = std::min(netinput.rows, preview_image_.rows); -    cv::Rect roi(0, 0, w, h); -    netinput(roi).copyTo(preview_image_(roi)); -} - -void Preview::draw_fps(double fps, double last_inference_time) -{ -    char buf[128]; -    ::snprintf(buf, sizeof(buf), "%d Hz, pose inference: %d ms", std::clamp(int(fps), 0, 9999), int(last_inference_time)); -    cv::putText(preview_image_, buf, cv::Point(10, preview_image_.rows-10), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(0, 255, 0), 1); -} - - -void Preview::copy_to_widget(cv_video_widget& widget) -{ -    if (preview_image_.rows > 0) -        widget.update_image(preview_image_); -} - - -cv::Rect2f Preview::transform(const cv::Rect2f& r) const -{ -    return { (r.x - offset_.x)*scale_, (r.y - offset_.y)*scale_, r.width*scale_, r.height*scale_ }; -} - -cv::Point2f Preview::transform(const cv::Point2f& p) const -{ -    return { (p.x - offset_.x)*scale_ , (p.y - offset_.y)*scale_ }; -} - -float Preview::transform(float s) const -{ -    return s * scale_; -} -  NeuralNetTracker::NeuralNetTracker()  {      opencv_init(); +    neuralnet_tracker_tests::run();  } @@ -921,6 +494,7 @@ bool NeuralNetTracker::load_and_initialize_model()              << e.what();          return false;      } +      return true;  } @@ -960,46 +534,6 @@ bool NeuralNetTracker::open_camera()  } -void NeuralNetTracker::set_intrinsics() -{ -    const int w = grayscale_.cols, h = grayscale_.rows; -    const double diag_fov = settings_.fov * M_PI / 180.; -    const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w)); -    const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h)); -    const double focal_length_w = 1. / tan(.5 * fov_w); -    const double focal_length_h = 1. 
/ tan(.5 * fov_h); - -    intrinsics_.fov_h = fov_h; -    intrinsics_.fov_w = fov_w; -    intrinsics_.focal_length_w = focal_length_w; -    intrinsics_.focal_length_h = focal_length_h; -} - - -class GuardedThreadCountSwitch -{ -    int old_num_threads_cv_ = 1; -    int old_num_threads_omp_ = 1; -    public: -        GuardedThreadCountSwitch(int num_threads) -        { -            old_num_threads_cv_ = cv::getNumThreads(); -            old_num_threads_omp_ = omp_get_num_threads(); -            omp_set_num_threads(num_threads); -            cv::setNumThreads(num_threads); -        } -             -        ~GuardedThreadCountSwitch() -        { -            omp_set_num_threads(old_num_threads_omp_); -            cv::setNumThreads(old_num_threads_cv_); -        } - -        GuardedThreadCountSwitch(const GuardedThreadCountSwitch&) = delete; -        GuardedThreadCountSwitch& operator=(const GuardedThreadCountSwitch&) = delete; -}; - -  void NeuralNetTracker::run()  {      preview_.init(*video_widget_); @@ -1055,7 +589,7 @@ void NeuralNetTracker::run()              }          } -        set_intrinsics(); +        intrinsics_ = make_intrinsics(grayscale_, settings_);          detect(); @@ -1113,7 +647,7 @@ void NeuralNetTracker::data(double *data)      Affine tmp = [&]()      {          QMutexLocker lck(&mtx_); -        return pose_; +        return last_pose_affine_;      }();      const auto& mx = tmp.R.col(0); @@ -1140,7 +674,7 @@ void NeuralNetTracker::data(double *data)  Affine NeuralNetTracker::pose()  {      QMutexLocker lck(&mtx_); -    return pose_; +    return last_pose_affine_;  }  std::tuple<cv::Size,double, double> NeuralNetTracker::stats() const diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.h b/tracker-neuralnet/ftnoir_tracker_neuralnet.h index 9b481186..9e0374da 100644 --- a/tracker-neuralnet/ftnoir_tracker_neuralnet.h +++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.h @@ -7,6 +7,11 @@  #pragma once +#include "ui_neuralnet-trackercontrols.h" +#include "model_adapters.h" +#include "deadzone_filter.h" +#include "preview.h" +  #include "options/options.hpp"  #include "api/plugin-api.hpp"  #include "cv/video-widget.hpp" @@ -27,13 +32,9 @@  #include <cinttypes>  #include <array> -#include <onnxruntime_cxx_api.h> -  #include <opencv2/core.hpp> -#include <opencv2/core/types.hpp>  #include <opencv2/imgproc.hpp> -#include "ui_neuralnet-trackercontrols.h"  namespace neuralnet_tracker_ns  { @@ -81,6 +82,8 @@ struct Settings : opts {      value<bool> use_mjpeg { b, "use-mjpeg", false };      value<int> num_threads { b, "num-threads", 1 };      value<int> resolution { b, "force-resolution", 0 }; +    value<double> deadzone_size { b, "deadzone-size", 1. }; +    value<double> deadzone_hardness { b, "deadzone-hardness", 1.5 };      Settings();  }; @@ -94,101 +97,6 @@ struct CamIntrinsics  }; -class Localizer -{ -    public: -        Localizer(Ort::MemoryInfo &allocator_info, -                    Ort::Session &&session); -         -        // Returns bounding wrt image coordinate of the input image -        // The preceeding float is the score for being a face normalized to [0,1]. 
-        std::pair<float, cv::Rect2f> run( -            const cv::Mat &frame); - -        double last_inference_time_millis() const; -    private: -        inline static constexpr int INPUT_IMG_WIDTH = 288; -        inline static constexpr int INPUT_IMG_HEIGHT = 224; -        Ort::Session session_{nullptr}; -        // Inputs / outputs -        cv::Mat scaled_frame_{}, input_mat_{}; -        Ort::Value input_val_{nullptr}, output_val_{nullptr}; -        std::array<float, 5> results_; -        double last_inference_time_ = 0; -}; - - -class PoseEstimator -{ -    public: -        struct Face -        { -            std::array<float,4> rotation; // Quaternion, (w, x, y, z) -            cv::Rect2f box; -            cv::Point2f center; -            float size; -        }; - -        PoseEstimator(Ort::MemoryInfo &allocator_info, -                        Ort::Session &&session); -        /** Inference -        * -        * Coordinates are defined wrt. the image space of the input `frame`. -        * X goes right, Z (depth) into the image, Y points down (like pixel coordinates values increase from top to bottom) -        */ -        std::optional<Face> run(const cv::Mat &frame, const cv::Rect &box); -        // Returns an image compatible with the 'frame' image for displaying. -        cv::Mat last_network_input() const; -        double last_inference_time_millis() const; -    private: -        // Operates on the private image data members -        int find_input_intensity_90_pct_quantile() const; - -        int64_t model_version_ = 0;  // Queried meta data from the ONNX file -        Ort::Session session_{nullptr};  // ONNX's runtime context for running the model -        Ort::Allocator allocator_;   // Memory allocator for tensors -        // Inputs -        cv::Mat scaled_frame_{}, input_mat_{};  // Input. One is the original crop, the other is rescaled (?) -        std::vector<Ort::Value> input_val_;    // Tensors to put into the model -        std::vector<const char*> input_names_; // Refers to the names in the onnx model.  -        // Outputs -        cv::Vec<float, 3> output_coord_{};  // 2d Coordinate and head size output. -        cv::Vec<float, 4> output_quat_{};   //  Quaternion output -        cv::Vec<float, 4> output_box_{};    // Bounding box output -        std::vector<Ort::Value> output_val_; // Tensors to put the model outputs in. -        std::vector<const char*> output_names_; // Refers to the names in the onnx model.  
-        size_t num_recurrent_states_ = 0; -        double last_inference_time_ = 0; -}; - - -class Preview -{ -public: -    void init(const cv_video_widget& widget); -    void copy_video_frame(const cv::Mat& frame); -    void draw_gizmos( -        const std::optional<PoseEstimator::Face> &face, -        const Affine& pose, -        const std::optional<cv::Rect2f>& last_roi, -        const std::optional<cv::Rect2f>& last_localizer_roi, -        const cv::Point2f& neckjoint_position); -    void overlay_netinput(const cv::Mat& netinput); -    void draw_fps(double fps, double last_inference_time); -    void copy_to_widget(cv_video_widget& widget); -private: -    // Transform from camera frame to preview -    cv::Rect2f transform(const cv::Rect2f& r) const; -    cv::Point2f transform(const cv::Point2f& p) const; -    float transform(float s) const; - -    cv::Mat preview_image_; -    cv::Size preview_size_ = { 0, 0 }; -    float scale_ = 1.f;   -    cv::Point2f offset_ = { 0.f, 0.f}; -}; - -  class NeuralNetTracker : protected virtual QThread, public ITracker  {      Q_OBJECT @@ -214,7 +122,10 @@ private:          const std::optional<PoseEstimator::Face> &face,          const Affine& pose);      void update_fps(double dt); -    Affine compute_pose(const PoseEstimator::Face &face) const; +    // Secretly applies filtering while computing the pose in 3d space. +    QuatPose compute_filtered_pose(const PoseEstimator::Face &face); +    // Compute the pose in 3d space taking the network outputs +    QuatPose transform_to_world_pose(const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) const;      Settings settings_;      std::optional<Localizer> localizer_; @@ -227,7 +138,7 @@ private:      std::array<cv::Mat,2> downsized_original_images_ = {}; // Image pyramid      std::optional<cv::Rect2f> last_localizer_roi_;      std::optional<cv::Rect2f> last_roi_; -    static constexpr float HEAD_SIZE_MM = 200.f; +    static constexpr float HEAD_SIZE_MM = 200.f; // In the vertical. Approximately.      mutable QMutex stats_mtx_;      double fps_ = 0; @@ -238,8 +149,9 @@ private:      int num_threads_ = 1;      bool is_visible_ = true; -    QMutex mtx_; // Protects the pose -    Affine pose_; +    QMutex mtx_ = {}; // Protects the pose +    std::optional<QuatPose> last_pose_ = {}; +    Affine last_pose_affine_ = {};      Preview preview_;      std::unique_ptr<cv_video_widget> video_widget_; @@ -288,6 +200,15 @@ class NeuralNetMetadata : public Metadata  } // neuralnet_tracker_ns + +namespace neuralnet_tracker_tests +{ + +void run(); + +} + +  using neuralnet_tracker_ns::NeuralNetTracker;  using neuralnet_tracker_ns::NeuralNetDialog;  using neuralnet_tracker_ns::NeuralNetMetadata; diff --git a/tracker-neuralnet/model_adapters.cpp b/tracker-neuralnet/model_adapters.cpp new file mode 100644 index 00000000..af599321 --- /dev/null +++ b/tracker-neuralnet/model_adapters.cpp @@ -0,0 +1,433 @@ +#include "model_adapters.h" + +#include "compat/timer.hpp" + +#include <opencv2/core.hpp> +#include <opencv2/core/quaternion.hpp> +#include <opencv2/imgproc.hpp> + +#include <QDebug> + + +namespace neuralnet_tracker_ns +{ + + +float sigmoid(float x) +{ +    return 1.f/(1.f + std::exp(-x)); +} + + +// Defined in ftnoir_tracker_neuralnet.cpp +// Normally we wouldn't need it here. However ... see below. +cv::Quatf image_to_world(cv::Quatf q); + + +cv::Quatf world_to_image(cv::Quatf q) +{ +    // It's its own inverse. 
+    return image_to_world(q); +} + + +cv::Rect2f unnormalize(const cv::Rect2f &r, int h, int w) +{ +    auto unnorm = [](float x) -> float { return 0.5*(x+1); }; +    auto tl = r.tl(); +    auto br = r.br(); +    auto x0 = unnorm(tl.x)*w; +    auto y0 = unnorm(tl.y)*h; +    auto x1 = unnorm(br.x)*w; +    auto y1 = unnorm(br.y)*h; +    return { +        x0, y0, x1-x0, y1-y0 +    }; +} + + +// Returns width and height of the input tensor, or throws. +// Expects the model to take one tensor as input that must +// have the shape B x C x H x W, where B=C=1. +cv::Size get_input_image_shape(const Ort::Session &session) +{ +    if (session.GetInputCount() < 1) +        throw std::invalid_argument("Model must take at least one input tensor"); +    const std::vector<std::int64_t> shape =  +        session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); +    if (shape.size() != 4) +        throw std::invalid_argument("Model takes the input tensor in the wrong shape"); +    return { static_cast<int>(shape[3]), static_cast<int>(shape[2]) }; +} + + +Ort::Value create_tensor(const Ort::TypeInfo& info, Ort::Allocator& alloc) +{ +    const auto shape = info.GetTensorTypeAndShapeInfo().GetShape(); +    auto t = Ort::Value::CreateTensor<float>( +        alloc, shape.data(), shape.size()); +    memset(t.GetTensorMutableData<float>(), 0, sizeof(float)*info.GetTensorTypeAndShapeInfo().GetElementCount()); +    return t; +} + + +int find_input_intensity_quantile(const cv::Mat& frame, float percentage) +{ +    const int channels[] = { 0 }; +    const int hist_size[] = { 256 }; +    float range[] = { 0, 256 }; +    const float* ranges[] = { range }; +    cv::Mat hist; +    cv::calcHist(&frame, 1,  channels, cv::Mat(), hist, 1, hist_size, ranges, true, false); +    int gray_level = 0; +    const int num_pixels_quantile = frame.total()*percentage*0.01f; +    int num_pixels_accum = 0; +    for (int i=0; i<hist_size[0]; ++i) +    { +        num_pixels_accum += hist.at<float>(i); +        if (num_pixels_accum > num_pixels_quantile) +        { +            gray_level = i; +            break; +        } +    } +    return gray_level; +} + + +// Automatic brightness adjustment. Scales brightness to lie between -.5 and 0.5, roughly. +void normalize_brightness(const cv::Mat& frame, cv::Mat& out) +{ +    const float pct = 90; + +    const int brightness = find_input_intensity_quantile(frame, pct); + +    const double alpha = brightness<127 ? (pct/100.f*0.5f/std::max(5,brightness)) : 1./255; +    const double beta = -0.5; + +    frame.convertTo(out, CV_32F, alpha, beta); +} + + + +Localizer::Localizer(Ort::MemoryInfo &allocator_info, Ort::Session &&session) : +    session_{std::move(session)}, +    scaled_frame_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_8U), +    input_mat_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_32F) +{ +    // Only works when input_mat does not reallocated memory ...which it should not. +    // Non-owning memory reference to input_mat? 
+    // Note: shape = (bach x channels x h x w) +    const std::int64_t input_shape[4] = { 1, 1, INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH }; +    input_val_ = Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4); + +    const std::int64_t output_shape[2] = { 1, 5 }; +    output_val_ = Ort::Value::CreateTensor<float>(allocator_info, results_.data(), results_.size(), output_shape, 2); +} + + +std::pair<float, cv::Rect2f> Localizer::run( +    const cv::Mat &frame) +{ +    auto p = input_mat_.ptr(0); + +    cv::resize(frame, scaled_frame_, { INPUT_IMG_WIDTH, INPUT_IMG_HEIGHT }, 0, 0, cv::INTER_AREA); +    scaled_frame_.convertTo(input_mat_, CV_32F, 1./255., -0.5); + +    assert (input_mat_.ptr(0) == p); +    assert (!input_mat_.empty() && input_mat_.isContinuous()); +    assert (input_mat_.cols == INPUT_IMG_WIDTH && input_mat_.rows == INPUT_IMG_HEIGHT); + +    const char* input_names[] = {"x"}; +    const char* output_names[] = {"logit_box"}; + +    Timer t; t.start(); + +    session_.Run(Ort::RunOptions{nullptr}, input_names, &input_val_, 1, output_names, &output_val_, 1); + +    last_inference_time_ = t.elapsed_ms(); + +    const cv::Rect2f roi = unnormalize(cv::Rect2f{ +        results_[1], +        results_[2], +        results_[3]-results_[1], // Width +        results_[4]-results_[2] // Height +    }, frame.rows, frame.cols); +    const float score = sigmoid(results_[0]); + +    return { score, roi }; +} + + +double Localizer::last_inference_time_millis() const +{ +    return last_inference_time_; +} + + +PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&session)  +    : model_version_{session.GetModelMetadata().GetVersion()} +    , session_{std::move(session)} +    , allocator_{session_, allocator_info} +{ +    using namespace std::literals::string_literals; + +    if (session_.GetOutputCount() < 2) +        throw std::runtime_error("Invalid Model: must have at least two outputs"); + +    // WARNING UB .. but still ... +    // If the model was saved without meta data, it seems the version field is uninitialized. +    // In that case reading from it is UB. However, in practice we will just get some arbitrary number +    // which is hopefully different from the numbers used by models where the version is set. 
+    if (model_version_ != 2 && model_version_ != 3) +        model_version_ = 1; + +    const cv::Size input_image_shape = get_input_image_shape(session_); + +    scaled_frame_ = cv::Mat(input_image_shape, CV_8U, cv::Scalar(0)); +    input_mat_ = cv::Mat(input_image_shape, CV_32F, cv::Scalar(0.f)); + +    { +        const std::int64_t input_shape[4] = { 1, 1, input_image_shape.height, input_image_shape.width }; +        input_val_.push_back( +            Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4)); +    } + +    struct TensorSpec +    { +        std::vector<int64_t> shape; +        float* buffer = nullptr; +        size_t element_count = 0; +        bool available = false; +    }; + +    std::unordered_map<std::string, TensorSpec> understood_outputs = { +        { "pos_size", TensorSpec{ { 1, 3 }, &output_coord_[0], output_coord_.rows } }, +        { "quat", TensorSpec{ { 1, 4},  &output_quat_[0], output_quat_.rows } }, +        { "box", TensorSpec{ { 1, 4}, &output_box_[0], output_box_.rows } }, +        { "rotaxis_scales_tril", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, +        { "rotaxis_std", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, // TODO: Delete when old models aren't used any more +        { "eyes", TensorSpec{ { 1, 2}, output_eyes_.val, output_eyes_.rows }}, +        { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}}, +        { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}}, +        //{ "box_std", TensorSpec{ {1, 4}, output_box_scales_.val, output_box_scales_ .rows}} +    }; + +    qDebug() << "Pose model inputs (" << session_.GetInputCount() << ")"; +    qDebug() << "Pose model outputs (" << session_.GetOutputCount() << "):"; +    for (size_t i=0; i<session_.GetOutputCount(); ++i) +    { +        const char* name = session_.GetOutputName(i, allocator_); +        const auto& output_info = session_.GetOutputTypeInfo(i); +        const auto& onnx_tensor_spec = output_info.GetTensorTypeAndShapeInfo(); +        auto my_tensor_spec = understood_outputs.find(name); + +        qDebug() << "\t" << name << " (" << onnx_tensor_spec.GetShape() << ") dtype: " <<  onnx_tensor_spec.GetElementType() << " " << +            (my_tensor_spec != understood_outputs.end() ? 
"ok" : "unknown"); + +        if (my_tensor_spec != understood_outputs.end()) +        { +            TensorSpec& t = my_tensor_spec->second; +            if (onnx_tensor_spec.GetShape() != t.shape ||  +                onnx_tensor_spec.GetElementType() != Ort::TypeToTensorType<float>::type) +                throw std::runtime_error("Invalid output tensor spec for "s + name); +            output_val_.push_back(Ort::Value::CreateTensor<float>( +                allocator_info, t.buffer, t.element_count, t.shape.data(), t.shape.size())); +            t.available = true; +        } +        else +        { +            // Create tensor regardless and ignore output +            output_val_.push_back(create_tensor(output_info, allocator_)); +        } +        output_names_.push_back(name); +    } + +    has_uncertainty_ = understood_outputs.at("rotaxis_scales_tril").available || +                       understood_outputs.at("rotaxis_std").available; +    has_uncertainty_ &= understood_outputs.at("pos_size_std").available || +                        understood_outputs.at("pos_size_scales").available; +    //has_uncertainty_ &= understood_outputs.at("box_std").available; +    has_eye_closed_detection_ = understood_outputs.at("eyes").available; + +    // FIXME: Recurrent states + +    // size_t num_regular_outputs = 2; + +    // num_recurrent_states_ = session_.GetInputCount()-1; +    // if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_) +    //     throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers."); + +    // // Create tensors for recurrent state +    // for (size_t i = 0; i < num_recurrent_states_; ++i) +    // { +    //     const auto& input_info = session_.GetInputTypeInfo(1+i); +    //     const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i); +    //     if (input_info.GetTensorTypeAndShapeInfo().GetShape() !=  +    //         output_info.GetTensorTypeAndShapeInfo().GetShape()) +    //         throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output"); +    //     input_val_.push_back(create_tensor(input_info, allocator_)); +    //     output_val_.push_back(create_tensor(output_info, allocator_)); +    // } + +    for (size_t i = 0; i < session_.GetInputCount(); ++i) +    { +        input_names_.push_back(session_.GetInputName(i, allocator_)); +    } + +    assert (input_names_.size() == input_val_.size()); +    assert (output_names_.size() == output_val_.size()); +} + + +std::optional<PoseEstimator::Face> PoseEstimator::run( +    const cv::Mat &frame, const cv::Rect &box) +{ +    cv::Mat cropped; + +    const int patch_size = std::max(box.width, box.height)*1.05; +    const cv::Point2f patch_center = { +        std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols), +        std::clamp<float>(box.y + 0.5f*box.height, 0.f, frame.rows) +    }; +    cv::getRectSubPix(frame, {patch_size, patch_size}, patch_center, cropped); + +    // Will get failure if patch_center is outside image boundaries settings. +    // Have to catch this case. 
+    if (cropped.rows != patch_size || cropped.cols != patch_size)
+        return {};
+
+    [[maybe_unused]] auto* p = input_mat_.ptr(0);
+
+    cv::resize(cropped, scaled_frame_, scaled_frame_.size(), 0, 0, cv::INTER_AREA);
+
+    normalize_brightness(scaled_frame_, input_mat_);
+
+    assert (input_mat_.ptr(0) == p);
+    assert (!input_mat_.empty() && input_mat_.isContinuous());
+
+    Timer t; t.start();
+
+    try
+    {
+        session_.Run(
+            Ort::RunOptions{ nullptr },
+            input_names_.data(),
+            input_val_.data(),
+            input_val_.size(),
+            output_names_.data(),
+            output_val_.data(),
+            output_val_.size());
+    }
+    catch (const Ort::Exception &e)
+    {
+        qDebug() << "Failed to run the model: " << e.what();
+        return {};
+    }
+
+    for (size_t i = 0; i<num_recurrent_states_; ++i)
+    {
+        // In the next step the current output becomes the input.
+        // This realizes the recurrent connection.
+        // Only the internal pointers are swapped; no data is copied.
+        std::swap(
+            output_val_[output_val_.size()-num_recurrent_states_+i],
+            input_val_[input_val_.size()-num_recurrent_states_+i]);
+    }
+
+    // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why?
+    //        The instructions are always the same. Maybe a memory allocation
+    //        issue. The ONNX API suggests that tensors are allocated in an
+    //        arena. Does that matter? Maybe the issue is something else?
+
+    last_inference_time_ = t.elapsed_ms();
+
+    // Perform the coordinate transformation:
+    // from patch-local coordinates normalized to [-1,1]
+    // to unnormalized frame pixel coordinates.
+
+    const cv::Point2f center = patch_center + 
+        (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]};
+
+    cv::Point2f center_stddev = {
+        (0.5f*patch_size)*output_coord_scales_[0],
+        (0.5f*patch_size)*output_coord_scales_[1] };
+
+    const float size = patch_size*0.5f*output_coord_[2];
+
+    float size_stddev = patch_size*0.5f*output_coord_scales_[2];
+
+    // The network outputs the quaternion as (x, y, z, w). cv::Quatf, like Eigen,
+    // stores the components in the order w, x, y, z, hence the reordering.
+    cv::Quatf rotation = { 
+        output_quat_[3], 
+        output_quat_[0], 
+        output_quat_[1], 
+        output_quat_[2] };
+
+    // Should be lower triangular. If it is not, something is probably wrong with the memory layout ... or the model.
+    assert(output_rotaxis_scales_tril_(0, 1) == 0); +    assert(output_rotaxis_scales_tril_(0, 2) == 0); +    assert(output_rotaxis_scales_tril_(1, 2) == 0); + +    cv::Matx33f rotaxis_scales_tril = output_rotaxis_scales_tril_; +     +    if (model_version_ < 2) +    { +        // Due to a change in coordinate conventions +        rotation = world_to_image(rotation); +    } + +    const cv::Rect2f outbox = { +        patch_center.x + (0.5f*patch_size)*output_box_[0], +        patch_center.y + (0.5f*patch_size)*output_box_[1], +        0.5f*patch_size*(output_box_[2]-output_box_[0]), +        0.5f*patch_size*(output_box_[3]-output_box_[1]) +    }; +    // const RoiCorners outbox = { +    //     patch_center + 0.5f*patch_size*cv::Point2f{output_box_[0], output_box_[1]}, +    //     patch_center + 0.5f*patch_size*cv::Point2f{output_box_[2], output_box_[3]} +    // }; +    // RoiCorners outbox_stddev = { +    //     0.5f*patch_size*cv::Point2f{output_box_scales_[0], output_box_scales_[1]}, +    //     0.5f*patch_size*cv::Point2f{output_box_scales_[2], output_box_scales_[3]} +    // }; + +    // Because the model is sensitive to closing eyes we increase the uncertainty  +    // a lot to make the subsequent filtering smooth the output more. This should suppress +    // "twitching" when the user blinks. +    if (has_eye_closed_detection_) +    { +        const float eye_open = std::min(output_eyes_[0], output_eyes_[1]); +        const float increase_factor = 1.f + 10.f * std::pow(1. - eye_open,4.f); +        rotaxis_scales_tril *= increase_factor; +        size_stddev *= increase_factor; +        center_stddev *= increase_factor; +    } + +    return std::optional<Face>({ +        rotation, rotaxis_scales_tril, outbox, center, center_stddev, size, size_stddev +    }); +} + + +cv::Mat PoseEstimator::last_network_input() const +{ +    assert(!input_mat_.empty()); +    cv::Mat ret; +    input_mat_.convertTo(ret, CV_8U, 255., 127.); +    cv::cvtColor(ret, ret, cv::COLOR_GRAY2RGB); +    return ret; +} + + +double PoseEstimator::last_inference_time_millis() const +{ +    return last_inference_time_; +} + + + + + +} // namespace neuralnet_tracker_ns
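Editor's note: the patch-local to frame-pixel mapping used in PoseEstimator::run() above is an affine rescaling around the crop center. A minimal sketch of that mapping (not part of the patch; the function name is made up):

    #include <opencv2/core.hpp>

    // Maps a patch-local coordinate normalized to [-1,1] into frame pixel
    // coordinates, mirroring the transform applied to the network outputs above.
    cv::Point2f patch_to_frame(const cv::Point2f& patch_center, float patch_size,
                               const cv::Point2f& normalized_xy)
    {
        return patch_center + (0.5f * patch_size) * normalized_xy;
    }

For instance, with a 200-pixel patch centered at (320, 240), a network output of (0, 0) maps back to the patch center and (1, 1) maps to (420, 340).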
\ No newline at end of file diff --git a/tracker-neuralnet/model_adapters.h b/tracker-neuralnet/model_adapters.h new file mode 100644 index 00000000..3fbfb861 --- /dev/null +++ b/tracker-neuralnet/model_adapters.h @@ -0,0 +1,102 @@ +#pragma once + +#include <optional> +#include <array> +#include <vector> + +#include <onnxruntime_cxx_api.h> +#include <opencv2/core.hpp> +#include "opencv_contrib.h" + + +namespace neuralnet_tracker_ns +{ + +// Generally useful sigmoid function +float sigmoid(float x); + + +class Localizer +{ +    public: +        Localizer(Ort::MemoryInfo &allocator_info, +                    Ort::Session &&session); +         +        // Returns bounding wrt image coordinate of the input image +        // The preceeding float is the score for being a face normalized to [0,1]. +        std::pair<float, cv::Rect2f> run( +            const cv::Mat &frame); + +        double last_inference_time_millis() const; +    private: +        inline static constexpr int INPUT_IMG_WIDTH = 288; +        inline static constexpr int INPUT_IMG_HEIGHT = 224; +        Ort::Session session_{nullptr}; +        // Inputs / outputs +        cv::Mat scaled_frame_{}, input_mat_{}; +        Ort::Value input_val_{nullptr}, output_val_{nullptr}; +        std::array<float, 5> results_; +        double last_inference_time_ = 0; +}; + + +class PoseEstimator +{ +    public: +        struct Face +        { +            cv::Quatf rotation; +            cv::Matx33f rotaxis_cov_tril; // Lower triangular factor of Cholesky decomposition +            cv::Rect2f box; +            cv::Point2f center; +            cv::Point2f center_stddev; +            float size; +            float size_stddev; +        }; + +        PoseEstimator(Ort::MemoryInfo &allocator_info, +                        Ort::Session &&session); +        /** Inference +        * +        * Coordinates are defined wrt. the image space of the input `frame`. +        * X goes right, Z (depth) into the image, Y points down (like pixel coordinates values increase from top to bottom) +        */ +        std::optional<Face> run(const cv::Mat &frame, const cv::Rect &box); +        // Returns an image compatible with the 'frame' image for displaying. +        cv::Mat last_network_input() const; +        double last_inference_time_millis() const; +        bool has_uncertainty() const { return has_uncertainty_; } + +    private: +        int64_t model_version_ = 0;  // Queried meta data from the ONNX file +        Ort::Session session_{nullptr};  // ONNX's runtime context for running the model +        Ort::Allocator allocator_;   // Memory allocator for tensors +        // Inputs +        cv::Mat scaled_frame_{}, input_mat_{};  // Input. One is the original crop, the other is rescaled (?) +        std::vector<Ort::Value> input_val_;    // Tensors to put into the model +        std::vector<const char*> input_names_; // Refers to the names in the onnx model.  +        // Outputs +        cv::Vec<float, 3> output_coord_{};  // 2d Coordinate and head size output. +        cv::Vec<float, 4> output_quat_{};   //  Quaternion output +        cv::Vec<float, 4> output_box_{};    // Bounding box output +        cv::Matx33f output_rotaxis_scales_tril_{}; // Lower triangular matrix of LLT factorization of covariance of rotation vector as offset from output quaternion +        cv::Vec<float, 2> output_eyes_{}; +        cv::Vec<float, 3> output_coord_scales_{}; +        std::vector<Ort::Value> output_val_; // Tensors to put the model outputs in. 
+        std::vector<const char*> output_names_; // Refers to the names in the ONNX model.
+        // More bookkeeping
+        size_t num_recurrent_states_ = 0;
+        double last_inference_time_ = 0;
+        bool has_uncertainty_ = false;
+        bool has_eye_closed_detection_ = false;
+};
+
+
+// Finds the gray level at or below which roughly `percentage` percent of the pixels lie.
+int find_input_intensity_quantile(const cv::Mat& frame, float percentage);
+
+// Adjusts the brightness to use the full range and scales the values to roughly [-0.5, 0.5].
+void normalize_brightness(const cv::Mat& frame, cv::Mat& out);
+
+
+} // namespace neuralnet_tracker_ns
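Editor's note: for orientation, a hedged sketch of how the two adapters declared above could be wired together. The model file names and the 0.5 acceptance threshold are assumptions, not taken from this patch; the real call sites live in the tracker class.

    #include <onnxruntime_cxx_api.h>
    #include <opencv2/core.hpp>
    #include <optional>
    #include "model_adapters.h"

    using namespace neuralnet_tracker_ns;

    // Runs the face localizer and, if a face was found, the pose network on a
    // grayscale input frame. Returns the estimated face pose, if any.
    std::optional<PoseEstimator::Face> detect_pose(Ort::Env& env, const cv::Mat& gray_frame)
    {
        Ort::SessionOptions opts;
        Ort::MemoryInfo mem_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);

        // Hypothetical model file names.
        Localizer localizer(mem_info, Ort::Session(env, ORT_TSTR("head-localizer.onnx"), opts));
        PoseEstimator pose_net(mem_info, Ort::Session(env, ORT_TSTR("head-pose.onnx"), opts));

        const auto [score, roi] = localizer.run(gray_frame);
        if (score < 0.5f)  // assumed acceptance threshold
            return std::nullopt;

        return pose_net.run(gray_frame, static_cast<cv::Rect>(roi));
    }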
\ No newline at end of file
diff --git a/tracker-neuralnet/opencv_contrib.h b/tracker-neuralnet/opencv_contrib.h
new file mode 100644
index 00000000..af92c12f
--- /dev/null
+++ b/tracker-neuralnet/opencv_contrib.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/base.hpp>
+#include <opencv2/core/quaternion.hpp>
+
+// Well, eventually this might become an OpenCV contribution
+
+namespace cvcontrib
+{
+
+
+template<class T>
+cv::Point_<T> as_point(const cv::Size_<T>& s)
+{
+    return { s.width, s.height };
+}
+
+
+template<class T>
+cv::Size_<T> as_size(const cv::Point_<T>& p)
+{
+    return { p.x, p.y };
+}
+
+
+template<int n, int m>
+inline bool allfinite(const cv::Matx<float, n, m> &mat)
+{
+    const size_t sz = mat.rows*mat.cols;
+    for (size_t i=0; i<sz; ++i)
+        if (!std::isfinite(mat.val[i]))
+            return false;
+    return true;
+}
+
+
+// Because the compiler refuses to convert it automatically
+template<int n>
+inline cv::Vec<float, n> to_vec(const cv::Matx<float, n, 1>& m)
+{
+    return cv::Vec<float,n>{m.val};
+}
+
+
+template<int n, int m, int o>
+inline void set_minor(cv::Vec<float, m> &dst, const int startrow, const cv::Matx<float, o, 1> &src)
+{
+    assert (startrow>=0 && startrow+n <= dst.rows);
+    for (int row=startrow, i=0; row<startrow+n; ++row,++i)
+    {
+        dst[row] = src(i,0);
+    }
+}
+
+
+template<int nrows, int ncols, int m, int n>
+inline void set_minor(cv::Matx<float, m, n>& dst, const int startrow, int startcol, const cv::Matx<float, nrows, ncols> &src)
+{
+    assert (startrow>=0 && startrow+nrows <= dst.rows);
+    assert (startcol>=0 && startcol+ncols <= dst.cols);
+    for (int row=startrow, i=0; row<startrow+nrows; ++row,++i)
+    {
+        for (int col=startcol, j=0; col<startcol+ncols; ++col,++j)
+        {
+            dst(row, col) = src(i,j);
+        }
+    }
+}
+
+
+inline cv::Quatf identity_quat()
+{
+    return cv::Quatf(1,0,0,0);
+}
+
+
+inline cv::Vec3f toRotVec(const cv::Quatf& q)
+{
+    // This is an improved implementation of cv::Quat::toRotVec()
+#if 1
+    // w = cos(alpha/2)
+    // xyz = sin(alpha/2)*axis
+    static constexpr float eps = 1.e-12;
+    const cv::Vec3f xyz{q.x, q.y, q.z};
+    const float len = cv::norm(xyz);
+    const float angle = std::atan2(len, q.w)*2.f;
+    return xyz*(angle/(len+eps));
+#else
+    // The OpenCV implementation fails even the simplest test:
+    // out = toRVec(cv::Quatf{1., 0., 0., 0. });
+    // ASSERT_TRUE(std::isfinite(out[0]) && std::isfinite(out[1]) && std::isfinite(out[2]));
+    return q.toRotVec();
+#endif
+}
+
+
+inline cv::Vec3f rotate(const cv::Quatf& q, const cv::Vec3f &v)
+{
+    const auto r = q * cv::Quatf{0., v[0], v[1], v[2]} * q.conjugate();
+    return { r.x, r.y, r.z };
+}
+
+
+template<int n>
+inline cv::Matx<float, n, n> cholesky(const cv::Matx<float, n, n>& mat)
+{
+    cv::Matx<float, n, n> l = mat;
+    // The code is the documentation!
+    // https://github.com/opencv/opencv/blob/4.5.4/modules/core/src/matrix_decomp.cpp#L95
+    cv::Cholesky(l.val, l.cols * sizeof(float), n, nullptr, 0, 0);
+    // It doesn't clear the upper triangle, so we clear it ourselves.
+    for (int row=0; row<n; ++row)
+        for (int col=row+1; col<n; ++col)
+            l(row, col) = 0.f;
+    return l;
+}
+
+
+} // namespace cvcontrib
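Editor's note: a small illustrative check, assumed usage rather than code from the patch. toRotVec() should be finite at the identity rotation (which, per the comment above, the stock OpenCV implementation gets wrong) and should invert cv::Quatf::createFromRvec().

    #include <cassert>
    #include <cmath>
    #include <opencv2/core/quaternion.hpp>
    #include "opencv_contrib.h"

    void toRotVec_roundtrip_demo()
    {
        using namespace cvcontrib;

        // The identity rotation maps to the zero rotation vector, without NaNs.
        const cv::Vec3f zero = toRotVec(identity_quat());
        assert(std::isfinite(zero[0]) && cv::norm(zero) < 1e-6f);

        // Round trip through a small rotation about the y axis.
        const cv::Vec3f rvec{0.f, 0.3f, 0.f};
        const cv::Vec3f back = toRotVec(cv::Quatf::createFromRvec(rvec));
        assert(cv::norm(back - rvec) < 1e-5f);
    }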
\ No newline at end of file diff --git a/tracker-neuralnet/preview.cpp b/tracker-neuralnet/preview.cpp new file mode 100644 index 00000000..76a6bbc0 --- /dev/null +++ b/tracker-neuralnet/preview.cpp @@ -0,0 +1,135 @@ +#include "preview.h" + + +namespace neuralnet_tracker_ns +{ + + +cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h) +{ +    auto [w, h] = size; +    if ( w*aspect_h > aspect_w*h ) +    { +        // Image is too wide +        const int new_w = (aspect_w*h)/aspect_h; +        return cv::Rect((w - new_w)/2, 0, new_w, h); +    } +    else +    { +        const int new_h = (aspect_h*w)/aspect_w; +        return cv::Rect(0, (h - new_h)/2, w, new_h); +    } +} + + + + +void Preview::init(const cv_video_widget& widget) +{ +    auto [w,h] = widget.preview_size(); +    preview_size_ = { w, h }; +} + + +void Preview::copy_video_frame(const cv::Mat& frame) +{ +    cv::Rect roi = make_crop_rect_for_aspect(frame.size(), preview_size_.width, preview_size_.height); +     +    cv::resize(frame(roi), preview_image_, preview_size_, 0, 0, cv::INTER_NEAREST); + +    offset_ = { (float)-roi.x, (float)-roi.y }; +    scale_ = float(preview_image_.cols) / float(roi.width); +} + + +void Preview::draw_gizmos( +    const std::optional<PoseEstimator::Face> &face, +    const std::optional<cv::Rect2f>& last_roi, +    const std::optional<cv::Rect2f>& last_localizer_roi, +    const cv::Point2f& neckjoint_position) +{ +    if (preview_image_.empty()) +        return; + +    if (last_roi)  +    { +        const int col = 255; +        cv::rectangle(preview_image_, transform(*last_roi), cv::Scalar(0, col, 0), /*thickness=*/1); +    } +    if (last_localizer_roi) +    { +        const int col = 255; +        cv::rectangle(preview_image_, transform(*last_localizer_roi), cv::Scalar(col, 0, 255-col), /*thickness=*/1); +    } + +    if (face) +    { +        if (face->size>=1.f) +            cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), int(transform(face->size)), cv::Scalar(255,255,255), 2); +        cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), 3, cv::Scalar(255,255,255), -1); + +        const cv::Matx33f R = face->rotation.toRotMat3x3(cv::QUAT_ASSUME_UNIT); + +        auto draw_coord_line = [&](int i, const cv::Scalar& color) +        { +            const float vx = R(0,i); +            const float vy = R(1,i); +            static constexpr float len = 100.f; +            cv::Point q = face->center + len*cv::Point2f{vx, vy}; +            cv::line(preview_image_, static_cast<cv::Point>(transform(face->center)), static_cast<cv::Point>(transform(q)), color, 2); +        }; +        draw_coord_line(0, {0, 0, 255}); +        draw_coord_line(1, {0, 255, 0}); +        draw_coord_line(2, {255, 0, 0}); + +            // Draw the computed joint position +        auto xy = transform(neckjoint_position); +        cv::circle(preview_image_, cv::Point(xy.x,xy.y), 5, cv::Scalar(0,0,255), -1); +    } + + +} + +void Preview::overlay_netinput(const cv::Mat& netinput) +{ +    if (netinput.empty()) +        return; + +    const int w = std::min(netinput.cols, preview_image_.cols); +    const int h = std::min(netinput.rows, preview_image_.rows); +    cv::Rect roi(0, 0, w, h); +    netinput(roi).copyTo(preview_image_(roi)); +} + +void Preview::draw_fps(double fps, double last_inference_time) +{ +    char buf[128]; +    ::snprintf(buf, sizeof(buf), "%d Hz, pose inference: %d ms", std::clamp(int(fps), 0, 9999), int(last_inference_time)); +    
cv::putText(preview_image_, buf, cv::Point(10, preview_image_.rows-10), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(0, 255, 0), 1); +} + + +void Preview::copy_to_widget(cv_video_widget& widget) +{ +    if (preview_image_.rows > 0) +        widget.update_image(preview_image_); +} + + +cv::Rect2f Preview::transform(const cv::Rect2f& r) const +{ +    return { (r.x - offset_.x)*scale_, (r.y - offset_.y)*scale_, r.width*scale_, r.height*scale_ }; +} + +cv::Point2f Preview::transform(const cv::Point2f& p) const +{ +    return { (p.x - offset_.x)*scale_ , (p.y - offset_.y)*scale_ }; +} + +float Preview::transform(float s) const +{ +    return s * scale_; +} + + +}
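Editor's note: a worked example of the aspect crop above (illustrative only). For a 640x480 frame and a 16:9 preview, 640*9 < 16*480, so the frame is too tall: make_crop_rect_for_aspect() keeps the full width and crops the height to 360 rows, centered vertically.

    #include <cassert>
    #include <opencv2/core.hpp>
    #include "preview.h"

    void crop_rect_demo()
    {
        const cv::Rect r = neuralnet_tracker_ns::make_crop_rect_for_aspect(cv::Size(640, 480), 16, 9);
        assert(r == cv::Rect(0, 60, 640, 360));  // full width, 360 rows, offset by (480-360)/2
    }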
\ No newline at end of file
diff --git a/tracker-neuralnet/preview.h b/tracker-neuralnet/preview.h
new file mode 100644
index 00000000..adc12993
--- /dev/null
+++ b/tracker-neuralnet/preview.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2021 Michael Welter <michael@welter-4d.de>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ */
+
+#pragma once
+
+#include "model_adapters.h"
+
+#include "cv/video-widget.hpp"
+
+#include <optional>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+
+namespace neuralnet_tracker_ns
+{
+
+/** Makes a maximum-size cropping rect with the given aspect ratio.
+*   @param aspect_w: numerator of the aspect ratio
+*   @param aspect_h: denominator of the aspect ratio
+*/
+cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h);
+
+
+/** This class is responsible for drawing the debug/info gizmos.
+*
+* In addition it provides functions to transform inputs to the size of
+* the preview image, which can differ from the camera frame.
+*/
+class Preview
+{
+public:
+    void init(const cv_video_widget& widget);
+    void copy_video_frame(const cv::Mat& frame);
+    void draw_gizmos(
+        const std::optional<PoseEstimator::Face> &face,
+        const std::optional<cv::Rect2f>& last_roi,
+        const std::optional<cv::Rect2f>& last_localizer_roi,
+        const cv::Point2f& neckjoint_position);
+    void overlay_netinput(const cv::Mat& netinput);
+    void draw_fps(double fps, double last_inference_time);
+    void copy_to_widget(cv_video_widget& widget);
+private:
+    // Transform from camera image coordinates to preview coordinates
+    cv::Rect2f transform(const cv::Rect2f& r) const;
+    cv::Point2f transform(const cv::Point2f& p) const;
+    float transform(float s) const;
+
+    cv::Mat preview_image_;
+    cv::Size preview_size_ = { 0, 0 };
+    float scale_ = 1.f;
+    cv::Point2f offset_ = { 0.f, 0.f};
+};
+
+} // neuralnet_tracker_ns
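Editor's note: a hedged sketch of a typical per-frame drawing sequence with this class. The argument names are placeholders; the real call sites (and when init() actually runs) live in the tracker implementation.

    #include "preview.h"

    void draw_preview_frame(neuralnet_tracker_ns::Preview& preview,
                            cv_video_widget& widget,
                            const cv::Mat& camera_frame,
                            const std::optional<neuralnet_tracker_ns::PoseEstimator::Face>& face,
                            const std::optional<cv::Rect2f>& roi,
                            const std::optional<cv::Rect2f>& localizer_roi,
                            const cv::Point2f& neck_position,
                            double fps, double inference_ms)
    {
        preview.init(widget);                      // pick up the widget's current preview size
        preview.copy_video_frame(camera_frame);    // crop + resize the camera frame to the preview
        preview.draw_gizmos(face, roi, localizer_roi, neck_position);
        preview.draw_fps(fps, inference_ms);
        preview.copy_to_widget(widget);            // hand the finished image to the Qt widget
    }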
\ No newline at end of file
diff --git a/tracker-neuralnet/tests.cpp b/tracker-neuralnet/tests.cpp
new file mode 100644
index 00000000..b1d2a6d0
--- /dev/null
+++ b/tracker-neuralnet/tests.cpp
@@ -0,0 +1,61 @@
+#include "model_adapters.h"
+
+#include <algorithm>
+#include <numeric>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+namespace neuralnet_tracker_tests
+{
+
+
+void assert_(bool ok, const std::string& msg)
+{
+    if (ok)
+        return;
+    std::cout << msg << std::endl;
+    std::exit(-1);
+}
+
+
+void test_find_input_intensity_quantile()
+{
+    cv::Mat data(10,10, CV_8UC1);
+    std::iota(data.begin<uint8_t>(), data.end<uint8_t>(), 0);
+
+    const float pct = 90;
+
+    const int val = neuralnet_tracker_ns::find_input_intensity_quantile(data, pct);
+
+    assert_(val == int(10*10*pct/100.f), "test_find_input_intensity_quantile failed");
+}
+
+
+void test_normalize_brightness()
+{
+    cv::Mat data(10,10, CV_8UC1);
+    std::iota(data.begin<uint8_t>(), data.end<uint8_t>(), 0);
+
+    cv::Mat out;
+    neuralnet_tracker_ns::normalize_brightness(data, out);
+
+    auto [minit,maxit] = std::minmax_element(out.begin<float>(),out.end<float>());
+    const auto minval = *minit;
+    const auto maxval = *maxit;
+    assert_(std::abs(minval + 0.5f) < 0.02, "test_normalize_brightness failed");
+    // If the brightest value is lower than half-max, it is boosted to half-max.
+    // Otherwise the range is just rescaled to [-0.5, 0.5]. Here we have the low-brightness case.
+    assert_(std::abs(maxval - 0.0f) < 0.02, "test_normalize_brightness failed");
+}
+
+
+void run()
+{
+    test_find_input_intensity_quantile();
+    test_normalize_brightness();
+}
+
+
+} // namespace neuralnet_tracker_tests
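Editor's note: a possible companion test, not part of the patch, for the high-brightness branch of normalize_brightness(): when the 90% quantile is already at or above 127 the frame is only rescaled to roughly [-0.5, 0.5] rather than boosted. It would slot in next to the existing tests above.

    void test_normalize_brightness_bright_input()
    {
        cv::Mat data(16, 16, CV_8UC1, cv::Scalar(255));  // uniformly bright frame
        cv::Mat out;
        neuralnet_tracker_ns::normalize_brightness(data, out);

        const float maxval = *std::max_element(out.begin<float>(), out.end<float>());
        assert_(std::abs(maxval - 0.5f) < 0.02f, "test_normalize_brightness_bright_input failed");
    }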
\ No newline at end of file diff --git a/tracker-neuralnet/unscented_trafo.h b/tracker-neuralnet/unscented_trafo.h new file mode 100644 index 00000000..267aa969 --- /dev/null +++ b/tracker-neuralnet/unscented_trafo.h @@ -0,0 +1,132 @@ +#pragma once + +#include <algorithm> +#include <opencv2/core.hpp> +#include <opencv2/core/base.hpp> +#include <opencv2/core/quaternion.hpp> + +#include <cmath> +#include <vector> + +#include "opencv_contrib.h" + +namespace ukf_cv +{ + +using namespace cvcontrib; + +template<int dim, int otherdim = dim> +using SigmaPoints = std::array<cv::Vec<float,otherdim>,dim*2+1>; + + +// Ported from  +// https://filterpy.readthedocs.io/en/latest/_modules/filterpy/kalman/sigma_points.html +// Excerpt from the original docu: +// " + +// Generates sigma points and weights according to Van der Merwe's +// 2004 dissertation[1] for the UnscentedKalmanFilter class.. It +// parametizes the sigma points using alpha, beta, kappa terms, and +// is the version seen in most publications. + +// Unless you know better, this should be your default choice. + +// alpha : float +//     Determins the spread of the sigma points around the mean. +//     Usually a small positive value (1e-3) according to [3]. + +// beta : float +//     Incorporates prior knowledge of the distribution of the mean. For +//     Gaussian x beta=2 is optimal, according to [3]. + +// kappa : float, default=0.0 +//     Secondary scaling parameter usually set to 0 according to [4], +//     or to 3-n according to [5]. + +// Reference +// .. [1] R. Van der Merwe "Sigma-Point Kalman Filters for Probabilitic +//        Inference in Dynamic State-Space Models" (Doctoral dissertation) + +// " +template<int dim> +class MerweScaledSigmaPoints +{ +public: +    static constexpr int num_sigmas = 2*dim+1; +     +    using Vector = cv::Vec<float,dim>; +    using Matrix = cv::Matx<float,dim,dim>; + +    MerweScaledSigmaPoints(float alpha = 0.01, float beta = 2., int kappa = 3-dim) +    { +        lambda = alpha*alpha * (dim + kappa) - dim; +        const float c = .5 / (dim + lambda); +        Wc_i = c; +        Wm_i = c; +        Wm_0 = lambda / (dim+lambda); +        Wc_0 = Wm_0 + (1.-alpha*alpha + beta); +    } + +    SigmaPoints<dim> compute_sigmas(const Vector &mu, const Matrix &mat, bool is_tril_factor) const +    { +        const Matrix triu_factor = is_tril_factor ? mat.t() : cholesky(mat).t(); + +        const Matrix U = triu_factor*std::sqrt(lambda+dim); + +        SigmaPoints<dim> sigmas; + +        sigmas[0] = mu; +        for (int k=0; k<dim; ++k) +        { +            sigmas[k+1] = to_vec(mu + U.row(k).t()); +            sigmas[dim+k+1] = to_vec(mu - U.row(k).t()); +        } +        return sigmas; +    } + +    template<int otherdim> +    std::tuple<cv::Vec<float,otherdim> , cv::Matx<float,otherdim,otherdim>> compute_statistics(const SigmaPoints<dim,otherdim> &sigmas) const +    { +        cv::Vec<float,otherdim> mu{}; // Zero initializes +        for (size_t i=0; i<sigmas.size(); ++i) +        { +            mu += to_vec((i==0 ? Wm_0 : Wm_i) * sigmas[i]); +        } + +        cv::Matx<float,otherdim,otherdim> cov{}; +        for (size_t i=0; i<sigmas.size(); ++i) +        { +            const auto p = sigmas[i] - mu; +            cov += (i==0 ? 
Wc_0 : Wc_i)*p*p.t(); +        } + +        return { mu, cov }; +    } + +    template<int otherdim> +    cv::Matx<float,dim,otherdim> compute_cov(const SigmaPoints<dim,dim> &sigmas, const SigmaPoints<dim,otherdim> &othersigmas) const +    { +        cv::Vec<float,dim> mu{}; // Zero initializes +        cv::Vec<float,otherdim> mu_other{}; // Zero initializes +        for (size_t i=0; i<sigmas.size(); ++i) +        { +            mu += to_vec((i==0 ? Wm_0 : Wm_i) * sigmas[i]); +            mu_other += to_vec((i==0 ? Wm_0 : Wm_i) * othersigmas[i]); +        } + +        cv::Matx<float,dim,otherdim> cov{}; +        for (size_t i=0; i<sigmas.size(); ++i) +        { +            const auto p = sigmas[i] - mu; +            const auto q = othersigmas[i] - mu_other; +            cov += (i==0 ? Wc_0 : Wc_i)*p*q.t(); +        } + +        return cov; +    } +private: +    float Wc_i, Wm_i, Wm_0, Wc_0, lambda; +}; + + +} // namespace ukf_cv
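Editor's note: an illustrative sketch of the sigma-point machinery (not from the patch): generating sigma points for a 3-dimensional Gaussian and recovering its statistics. Without a nonlinearity applied to the sigma points in between, compute_statistics() should reproduce the original mean and covariance up to floating point error; in an actual unscented transform a nonlinear function would be applied to each sigma point first.

    #include <opencv2/core.hpp>
    #include "unscented_trafo.h"

    void unscented_identity_demo()
    {
        const ukf_cv::MerweScaledSigmaPoints<3> points;  // default alpha/beta/kappa

        const cv::Vec3f mu{1.f, 2.f, 3.f};
        const cv::Matx33f cov = cv::Matx33f::eye() * 0.25f;

        // is_tril_factor = false: pass the covariance itself, not its Cholesky factor.
        const ukf_cv::SigmaPoints<3> sigmas = points.compute_sigmas(mu, cov, false);

        // mu2 and cov2 approximately reproduce mu and cov (the Merwe weights are
        // large in magnitude, so expect some floating point cancellation error).
        const auto [mu2, cov2] = points.compute_statistics(sigmas);
        (void)mu2; (void)cov2;
    }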
\ No newline at end of file | 
