Diffstat (limited to 'tracker-neuralnet/ftnoir_tracker_neuralnet.cpp')
-rw-r--r-- | tracker-neuralnet/ftnoir_tracker_neuralnet.cpp | 822 |
1 file changed, 178 insertions, 644 deletions
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp index 62209978..b56b4e96 100644 --- a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp +++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp @@ -6,17 +6,19 @@ */ #include "ftnoir_tracker_neuralnet.h" +#include "deadzone_filter.h" +#include "opencv_contrib.h" + #include "compat/sleep.hpp" #include "compat/math-imports.hpp" -#include "cv/init.hpp" -#include <opencv2/core.hpp> -#include <opencv2/core/hal/interface.h> -#include <opencv2/core/types.hpp> -#include <opencv2/calib3d.hpp> -#include <opencv2/imgcodecs.hpp> #include "compat/timer.hpp" #include "compat/check-visible.hpp" +#include "cv/init.hpp" + #include <omp.h> +#include <onnxruntime_cxx_api.h> +#include <opencv2/core.hpp> +#include <opencv2/core/quaternion.hpp> #ifdef _MSC_VER # pragma warning(disable : 4702) @@ -32,7 +34,7 @@ #include <chrono> #include <string> #include <stdexcept> - +#include <unordered_map> // Some demo code for onnx // https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp @@ -41,12 +43,11 @@ namespace neuralnet_tracker_ns { +using namespace cvcontrib; using numeric_types::vec3; using numeric_types::vec2; using numeric_types::mat33; -using quat = std::array<numeric_types::f,4>; - #if _MSC_VER std::wstring convert(const QString &s) { return s.toStdWString(); } @@ -55,6 +56,18 @@ std::string convert(const QString &s) { return s.toStdString(); } #endif +int enum_to_fps(int value) +{ + switch (value) + { + case fps_30: return 30; + case fps_60: return 60; + default: [[fallthrough]]; + case fps_default: return 0; + } +} + + template<class F> struct OnScopeExit { @@ -67,28 +80,38 @@ struct OnScopeExit }; -float sigmoid(float x) +CamIntrinsics make_intrinsics(const cv::Mat& img, const Settings& settings) { - return 1.f/(1.f + std::exp(-x)); -} - + const int w = img.cols, h = img.rows; + const double diag_fov = settings.fov * M_PI / 180.; + const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w)); + const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h)); + const double focal_length_w = 1. / tan(.5 * fov_w); + const double focal_length_h = 1. / tan(.5 * fov_h); + /* a + ______ <--- here is sensor area + | / + | / + f | / + | / 2 x angle is the fov + |/ + <--- here is the hole of the pinhole camera + + So, a / f = tan(fov / 2) + => f = a/tan(fov/2) + What is a? + 1 if we define f in terms of clip space where the image plane goes from -1 to 1. Because a is the half-width. 
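      Worked example (editor's illustration, not part of the patch): for a 640x480
      frame and a 60 degree diagonal FOV,
          tan(30 deg) ~= 0.577 and sqrt(1 + (480/640)^2) = 1.25,
      so fov_w = 2*atan(0.577/1.25) ~= 49.6 deg and
      focal_length_w = 1/tan(fov_w/2) ~= 2.17, expressed in clip-space units,
      i.e. relative to the half-width of the image plane.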
+ */ -cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h) -{ - auto [w, h] = size; - if ( w*aspect_h > aspect_w*h ) - { - // Image is too wide - const int new_w = (aspect_w*h)/aspect_h; - return cv::Rect((w - new_w)/2, 0, new_w, h); - } - else - { - const int new_h = (aspect_h*w)/aspect_w; - return cv::Rect(0, (h - new_h)/2, w, new_h); - } + return { + (float)focal_length_w, + (float)focal_length_h, + (float)fov_w, + (float)fov_h + }; } + cv::Rect make_crop_rect_multiple_of(const cv::Size &size, int multiple) { const int new_w = (size.width / multiple) * multiple; @@ -111,20 +134,6 @@ cv::Rect_<T> squarize(const cv::Rect_<T> &r) template<class T> -cv::Point_<T> as_point(const cv::Size_<T>& s) -{ - return { s.width, s.height }; -} - - -template<class T> -cv::Size_<T> as_size(const cv::Point_<T>& p) -{ - return { p.x, p.y }; -} - - -template<class T> cv::Rect_<T> expand(const cv::Rect_<T>& r, T factor) { // xnew = l+.5*w - w*f*0.5 = l + .5*(w - new_w) @@ -145,110 +154,40 @@ cv::Rect_<T> ewa_filter(const cv::Rect_<T>& last, const cv::Rect_<T>& current, T } -cv::Rect2f unnormalize(const cv::Rect2f &r, int h, int w) -{ - auto unnorm = [](float x) -> float { return 0.5*(x+1); }; - auto tl = r.tl(); - auto br = r.br(); - auto x0 = unnorm(tl.x)*w; - auto y0 = unnorm(tl.y)*h; - auto x1 = unnorm(br.x)*w; - auto y1 = unnorm(br.y)*h; - return { - x0, y0, x1-x0, y1-y0 - }; -} - -cv::Point2f normalize(const cv::Point2f &p, int h, int w) -{ - return { - p.x/w*2.f-1.f, - p.y/h*2.f-1.f - }; -} - - -mat33 rotation_from_two_vectors(const vec3 &a, const vec3 &b) -{ - vec3 axis = a.cross(b); - const float len_a = cv::norm(a); - const float len_b = cv::norm(b); - const float len_axis = cv::norm(axis); - const float sin_angle = std::clamp(len_axis / (len_a * len_b), -1.f, 1.f); - const float angle = std::asin(sin_angle); - axis *= angle/(1.e-12 + len_axis); - mat33 out; - cv::Rodrigues(axis, out); - return out; -} - - -// Computes correction due to head being off screen center. -// x, y: In screen space, i.e. 
in [-1,1] -// focal_length_x: In screen space -mat33 compute_rotation_correction(const cv::Point2f &p, float focal_length_x) -{ - return rotation_from_two_vectors( - {1.f,0.f,0.f}, - {focal_length_x, p.y, p.x}); -} - - -mat33 quaternion_to_mat33(const std::array<float,4> quat) -{ - mat33 m; - const float w = quat[0]; - const float i = quat[1]; - const float j = quat[2]; - const float k = quat[3]; - m(0,0) = 1.f - 2.f*(j*j + k*k); - m(1,0) = 2.f*(i*j + k*w); - m(2,0) = 2.f*(i*k - j*w); - m(0,1) = 2.f*(i*j - k*w); - m(1,1) = 1.f - 2.f*(i*i + k*k); - m(2,1) = 2.f*(j*k + i*w); - m(0,2) = 2.f*(i*k + j*w); - m(1,2) = 2.f*(j*k - i*w); - m(2,2) = 1.f - 2.f*(i*i + j*j); - return m; -} - - -vec3 rotate_vec(const quat& q, const vec3& p) +cv::Vec3f image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) { - const float qw = q[0]; - const float qi = q[1]; - const float qj = q[2]; - const float qk = q[3]; - const float pi = p[0]; - const float pj = p[1]; - const float pk = p[2]; - const quat tmp{ - - qi*pi - qj*pj - qk*pk, - qw*pi + qj*pk - qk*pj, - qw*pj - qi*pk + qk*pi, - qw*pk + qi*pj - qj*pi - }; - const vec3 out { - -tmp[0]*qi + tmp[1]*qw - tmp[2]*qk + tmp[3]*qj, - -tmp[0]*qj + tmp[1]*qk + tmp[2]*qw - tmp[3]*qi, - -tmp[0]*qk - tmp[1]*qj + tmp[2]*qi + tmp[3]*qw - }; - return out; -} - - -vec3 image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) -{ - // Compute the location the network outputs in 3d space. - const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / size * reference_size_in_mm; + /* + Compute the location the network outputs in 3d space. + + hhhhhh <- head size (meters) + \ | ----------------------- + \ | \ + \ | | + \ | |- x (meters) + ____ <- face.size / width | + \ | | | + \| |- focal length / + ------------------------ + ------------------------------------------------>> z direction + z/x = zi / f + zi = image position + z = world position + f = focal length + + We can also do deltas: + dz / x = dzi / f + => x = dz / dzi * f + which means we can compute x from the head size (dzi) if we assume some reference size (dz). + */ + const float head_size_vertical = 2.f*size; // Size from the model is more like half the real vertical size of a human head. + const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / head_size_vertical * reference_size_in_mm; const float zpos = (x / image_size.width * 2.f - 1.f) * xpos / intrinsics.focal_length_w; const float ypos = (y / image_size.height * 2.f - 1.f) * xpos / intrinsics.focal_length_h; return {xpos, ypos, zpos}; } -vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) +vec2 world_to_image(const cv::Vec3f& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) { const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w; const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h; @@ -258,7 +197,7 @@ vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntr } -quat image_to_world(quat q) +cv::Quatf image_to_world(cv::Quatf q) { std::swap(q[1], q[3]); q[1] = -q[1]; @@ -267,338 +206,73 @@ quat image_to_world(quat q) return q; } -quat world_to_image(quat q) -{ - // It's its own inverse. - return image_to_world(q); -} - -// Intersection over union. A value between 0 and 1 which measures the match between the bounding boxes. 
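// Editor's note (illustrative, not part of the patch): for two unit squares offset by
// half their width, e.g. cv::Rect2f{0,0,1,1} and cv::Rect2f{0.5f,0,1,1}, the
// intersection area is 0.5 and the union is 1 + 1 - 0.5 = 1.5, so iou() = 1/3.
// A value of 1 means the boxes coincide exactly; 0 means they do not overlap at all.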
-template<class T> -T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b) -{ - auto i = a & b; - return double{i.area()} / (a.area()+b.area()-i.area()); -} - -// Returns width and height of the input tensor, or throws. -// Expects the model to take one tensor as input that must -// have the shape B x C x H x W, where B=C=1. -cv::Size get_input_image_shape(const Ort::Session &session) -{ - if (session.GetInputCount() < 1) - throw std::invalid_argument("Model must take at least one input tensor"); - const std::vector<std::int64_t> shape = - session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); - if (shape.size() != 4) - throw std::invalid_argument("Model takes the input tensor in the wrong shape"); - return { static_cast<int>(shape[3]), static_cast<int>(shape[2]) }; -} - -Ort::Value create_tensor(const Ort::TypeInfo& info, Ort::Allocator& alloc) +cv::Point2f normalize(const cv::Point2f &p, int h, int w) { - const auto shape = info.GetTensorTypeAndShapeInfo().GetShape(); - auto t = Ort::Value::CreateTensor<float>( - alloc, shape.data(), shape.size()); - memset(t.GetTensorMutableData<float>(), 0, sizeof(float)*info.GetTensorTypeAndShapeInfo().GetElementCount()); - return t; + return { + p.x/w*2.f-1.f, + p.y/h*2.f-1.f + }; } - -int enum_to_fps(int value) +cv::Quatf rotation_from_two_vectors(const vec3 &a, const vec3 &b) { - switch (value) + // |axis| = |a| * |b| * sin(alpha) + const vec3 axis = a.cross(b); + // dot = |a|*|b|*cos(alpha) + const float dot = a.dot(b); + const float len = cv::norm(axis); + vec3 normed_axis = axis / len; + float angle = std::atan2(len, dot); + if (!(std::isfinite(normed_axis[0]) && std::isfinite(normed_axis[1]) && std::isfinite(normed_axis[2]))) { - case fps_30: return 30; - case fps_60: return 60; - default: [[fallthrough]]; - case fps_default: return 0; + angle = 0.f; + normed_axis = vec3{1.,0.,0.}; } + return cv::Quatf::createFromAngleAxis(angle, normed_axis); } -Localizer::Localizer(Ort::MemoryInfo &allocator_info, Ort::Session &&session) : - session_{std::move(session)}, - scaled_frame_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_8U), - input_mat_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_32F) -{ - // Only works when input_mat does not reallocated memory ...which it should not. - // Non-owning memory reference to input_mat? 
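// Editor's sketch, not part of the patch: the non-owning binding referred to in the
// comment above. Ort::Value::CreateTensor wraps an existing float buffer, so the
// cv::Mat must stay alive and must never reallocate while the tensor is in use.
// (`allocator_info` stands in for the tracker's Ort::MemoryInfo instance.)
Ort::Value wrap_mat_as_tensor(const Ort::MemoryInfo& allocator_info, cv::Mat& input)
{
    CV_Assert(input.type() == CV_32F && input.isContinuous());
    const std::int64_t shape[4] = { 1, 1, input.rows, input.cols };
    // No copy is made: the tensor aliases the Mat's pixel buffer.
    return Ort::Value::CreateTensor<float>(
        allocator_info, input.ptr<float>(0), input.total(), shape, 4);
}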
- // Note: shape = (bach x channels x h x w) - const std::int64_t input_shape[4] = { 1, 1, INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH }; - input_val_ = Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4); - - const std::int64_t output_shape[2] = { 1, 5 }; - output_val_ = Ort::Value::CreateTensor<float>(allocator_info, results_.data(), results_.size(), output_shape, 2); -} - - -std::pair<float, cv::Rect2f> Localizer::run( - const cv::Mat &frame) -{ - auto p = input_mat_.ptr(0); - - cv::resize(frame, scaled_frame_, { INPUT_IMG_WIDTH, INPUT_IMG_HEIGHT }, 0, 0, cv::INTER_AREA); - scaled_frame_.convertTo(input_mat_, CV_32F, 1./255., -0.5); - - assert (input_mat_.ptr(0) == p); - assert (!input_mat_.empty() && input_mat_.isContinuous()); - assert (input_mat_.cols == INPUT_IMG_WIDTH && input_mat_.rows == INPUT_IMG_HEIGHT); - - const char* input_names[] = {"x"}; - const char* output_names[] = {"logit_box"}; - - Timer t; t.start(); - - session_.Run(Ort::RunOptions{nullptr}, input_names, &input_val_, 1, output_names, &output_val_, 1); - - last_inference_time_ = t.elapsed_ms(); - - const cv::Rect2f roi = unnormalize(cv::Rect2f{ - results_[1], - results_[2], - results_[3]-results_[1], // Width - results_[4]-results_[2] // Height - }, frame.rows, frame.cols); - const float score = sigmoid(results_[0]); - - return { score, roi }; -} - - -double Localizer::last_inference_time_millis() const +// Computes correction due to head being off screen center. +cv::Quatf compute_rotation_correction(const cv::Point3f& p) { - return last_inference_time_; + return rotation_from_two_vectors( + {-1.f,0.f,0.f}, p); } -PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session) - : model_version_{_session.GetModelMetadata().GetVersion()} - , session_{std::move(_session)} - , allocator_{session_, allocator_info} +// Intersection over union. A value between 0 and 1 which measures the match between the bounding boxes. +template<class T> +T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b) { - using namespace std::literals::string_literals; - - if (session_.GetOutputCount() < 2) - throw std::runtime_error("Invalid Model: must have at least two outputs"); - - // WARNING UB .. but still ... - // If the model was saved without meta data, it seems the version field is uninitialized. - // In that case reading from it is UB. However, we will just get same arbitrary number - // which is hopefully different from the numbers used by models where the version is set. - // I.e., this is what happended in practice so far. 
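// Editor's sketch, not part of the patch: the fallback described in the comment above,
// written as a standalone guard. GetModelMetadata().GetVersion() returns whatever was
// stored at export time; models exported without metadata yield an arbitrary value,
// so anything other than a known version number is treated as the legacy layout.
int64_t sanitize_model_version(Ort::Session& session)
{
    const int64_t v = session.GetModelMetadata().GetVersion();
    return (v == 2) ? 2 : 1;   // unknown or unset metadata -> assume version 1
}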
- if (model_version_ != 2) - model_version_ = 1; - - const cv::Size input_image_shape = get_input_image_shape(session_); - - scaled_frame_ = cv::Mat(input_image_shape, CV_8U, cv::Scalar(0)); - input_mat_ = cv::Mat(input_image_shape, CV_32F, cv::Scalar(0.f)); - - { - const std::int64_t input_shape[4] = { 1, 1, input_image_shape.height, input_image_shape.width }; - input_val_.push_back( - Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4)); - } - - { - const std::int64_t output_shape[2] = { 1, 3 }; - output_val_.push_back(Ort::Value::CreateTensor<float>( - allocator_info, &output_coord_[0], output_coord_.rows, output_shape, 2)); - } - - { - const std::int64_t output_shape[2] = { 1, 4 }; - output_val_.push_back(Ort::Value::CreateTensor<float>( - allocator_info, &output_quat_[0], output_quat_.rows, output_shape, 2)); - } - - size_t num_regular_outputs = 2; - - if (session_.GetOutputCount() >= 3 && "box"s == session_.GetOutputName(2, allocator_)) - { - const std::int64_t output_shape[2] = { 1, 4 }; - output_val_.push_back(Ort::Value::CreateTensor<float>( - allocator_info, &output_box_[0], output_box_.rows, output_shape, 2)); - ++num_regular_outputs; - qDebug() << "Note: Legacy model output for face ROI is currently ignored"; - } - - num_recurrent_states_ = session_.GetInputCount()-1; - if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_) - throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers."); - - // Create tensors for recurrent state - for (size_t i = 0; i < num_recurrent_states_; ++i) - { - const auto& input_info = session_.GetInputTypeInfo(1+i); - const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i); - if (input_info.GetTensorTypeAndShapeInfo().GetShape() != - output_info.GetTensorTypeAndShapeInfo().GetShape()) - throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output"); - input_val_.push_back(create_tensor(input_info, allocator_)); - output_val_.push_back(create_tensor(output_info, allocator_)); - } - - for (size_t i = 0; i < session_.GetInputCount(); ++i) - { - input_names_.push_back(session_.GetInputName(i, allocator_)); - } - for (size_t i = 0; i < session_.GetOutputCount(); ++i) - { - output_names_.push_back(session_.GetOutputName(i, allocator_)); - } - - qDebug() << "Model inputs: " << session_.GetInputCount() << ", outputs: " << session_.GetOutputCount() << ", recurrent states: " << num_recurrent_states_; - - assert (input_names_.size() == input_val_.size()); - assert (output_names_.size() == output_val_.size()); + auto i = a & b; + return double{i.area()} / (a.area()+b.area()-i.area()); } -int PoseEstimator::find_input_intensity_90_pct_quantile() const +class GuardedThreadCountSwitch { - const int channels[] = { 0 }; - const int hist_size[] = { 255 }; - float range[] = { 0, 256 }; - const float* ranges[] = { range }; - cv::Mat hist; - cv::calcHist(&scaled_frame_, 1, channels, cv::Mat(), hist, 1, hist_size, ranges, true, false); - int gray_level = 0; - const int num_pixels_quantile = scaled_frame_.total()*0.9f; - int num_pixels_accum = 0; - for (int i=0; i<hist_size[0]; ++i) - { - num_pixels_accum += hist.at<float>(i); - if (num_pixels_accum > num_pixels_quantile) + int old_num_threads_cv_ = 1; + int old_num_threads_omp_ = 1; + public: + GuardedThreadCountSwitch(int num_threads) { - 
gray_level = i; - break; + old_num_threads_cv_ = cv::getNumThreads(); + old_num_threads_omp_ = omp_get_num_threads(); + omp_set_num_threads(num_threads); + cv::setNumThreads(num_threads); + } + + ~GuardedThreadCountSwitch() + { + omp_set_num_threads(old_num_threads_omp_); + cv::setNumThreads(old_num_threads_cv_); } - } - return gray_level; -} - - -std::optional<PoseEstimator::Face> PoseEstimator::run( - const cv::Mat &frame, const cv::Rect &box) -{ - cv::Mat cropped; - - const int patch_size = std::max(box.width, box.height)*1.05; - const cv::Point2f patch_center = { - std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols), - std::clamp<float>(box.y + 0.5f*box.height, 0.f, frame.rows) - }; - cv::getRectSubPix(frame, {patch_size, patch_size}, patch_center, cropped); - - // Will get failure if patch_center is outside image boundariesettings. - // Have to catch this case. - if (cropped.rows != patch_size || cropped.cols != patch_size) - return {}; - - auto p = input_mat_.ptr(0); - - cv::resize(cropped, scaled_frame_, scaled_frame_.size(), 0, 0, cv::INTER_AREA); - - // Automatic brightness amplification. - const int brightness = find_input_intensity_90_pct_quantile(); - const double alpha = brightness<127 ? 0.5/std::max(5,brightness) : 1./255; - const double beta = -0.5; - - scaled_frame_.convertTo(input_mat_, CV_32F, alpha, beta); - - assert (input_mat_.ptr(0) == p); - assert (!input_mat_.empty() && input_mat_.isContinuous()); - - - Timer t; t.start(); - - try - { - session_.Run( - Ort::RunOptions{ nullptr }, - input_names_.data(), - input_val_.data(), - input_val_.size(), - output_names_.data(), - output_val_.data(), - output_val_.size()); - } - catch (const Ort::Exception &e) - { - qDebug() << "Failed to run the model: " << e.what(); - return {}; - } - - for (size_t i = 0; i<num_recurrent_states_; ++i) - { - // Next step, the current output becomes the input. - // Thus we realize the recurrent connection. - // Only swaps the internal pointers. There is no copy of data. - std::swap( - output_val_[output_val_.size()-num_recurrent_states_+i], - input_val_[input_val_.size()-num_recurrent_states_+i]); - } - - // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why? - // The instructions are always the same. Maybe a memory allocation - // issue. The ONNX api suggests that tensor are allocated in an - // arena. Does that matter? Maybe the issue is something else? - - last_inference_time_ = t.elapsed_ms(); - - // Perform coordinate transformation. - // From patch-local normalized in [-1,1] to - // frame unnormalized pixel coordinatesettings. - - const cv::Point2f center = patch_center + - (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]}; - - const float size = patch_size*0.5f*output_coord_[2]; - - // Following Eigen which uses quat components in the order w, x, y, z. 
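// Editor's note (illustrative, not part of the patch): the network emits the quaternion
// components as (x, y, z, w), while both the old `quat` array and cv::Quatf store them
// as (w, x, y, z); hence the reordering just below. With cv::Quatf the same repacking is
//   cv::Quatf rotation{ output_quat_[3], output_quat_[0], output_quat_[1], output_quat_[2] };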
- quat rotation = { - output_quat_[3], - output_quat_[0], - output_quat_[1], - output_quat_[2] }; - - if (model_version_ < 2) - { - // Due to a change in coordinate conventions - rotation = world_to_image(rotation); - } - - const cv::Rect2f outbox = { - patch_center.x + (0.5f*patch_size)*output_box_[0], - patch_center.y + (0.5f*patch_size)*output_box_[1], - 0.5f*patch_size*(output_box_[2]-output_box_[0]), - 0.5f*patch_size*(output_box_[3]-output_box_[1]) - }; - - return std::optional<Face>({ - rotation, outbox, center, size - }); -} - - -cv::Mat PoseEstimator::last_network_input() const -{ - assert(!input_mat_.empty()); - cv::Mat ret; - input_mat_.convertTo(ret, CV_8U, 255., 127.); - cv::cvtColor(ret, ret, cv::COLOR_GRAY2RGB); - return ret; -} - -double PoseEstimator::last_inference_time_millis() const -{ - return last_inference_time_; -} + GuardedThreadCountSwitch(const GuardedThreadCountSwitch&) = delete; + GuardedThreadCountSwitch& operator=(const GuardedThreadCountSwitch&) = delete; +}; bool NeuralNetTracker::detect() @@ -645,6 +319,8 @@ bool NeuralNetTracker::detect() if (!last_roi_) { + // Last iteration the tracker failed to generate a trustworthy + // roi and the localizer also cannot find a face. draw_gizmos({}, {}); return false; } @@ -655,37 +331,28 @@ bool NeuralNetTracker::detect() if (!face) { last_roi_.reset(); - draw_gizmos(*face, {}); + draw_gizmos({}, {}); return false; } - { - // Here: compute ROI from head size estimate. This helps make the size prediction more - // invariant to mouth opening. The tracking can be lost more often at extreme - // rotations, depending on the implementation details. The code down here has - // been tweaked so that it works pretty well. - // In old behaviour ROI is taken from the model outputs - const vec3 offset = rotate_vec(face->rotation, vec3{0.f, 0.1f*face->size, face->size*0.3f}); - const float halfsize = face->size/float(settings_.roi_zoom); - face->box = cv::Rect2f( - face->center.x + offset[0] - halfsize, - face->center.y + offset[1] - halfsize, - halfsize*2.f, - halfsize*2.f - ); - } - - last_roi_ = ewa_filter(*last_roi_, face->box, float(settings_.roi_filter_alpha)); - - Affine pose = compute_pose(*face); + cv::Rect2f roi = expand(face->box, (float)settings_.roi_zoom); - draw_gizmos(*face, pose); + last_roi_ = ewa_filter(*last_roi_, roi, float(settings_.roi_filter_alpha)); + QuatPose pose = compute_filtered_pose(*face); + last_pose_ = pose; + + Affine pose_affine = { + pose.rot.toRotMat3x3(cv::QUAT_ASSUME_UNIT), + pose.pos }; + { QMutexLocker lck(&mtx_); - this->pose_ = pose; + last_pose_affine_ = pose_affine; } + draw_gizmos(*face, last_pose_affine_); + return true; } @@ -697,168 +364,74 @@ void NeuralNetTracker::draw_gizmos( if (!is_visible_) return; - preview_.draw_gizmos(face, pose, last_roi_, last_localizer_roi_, world_to_image(pose.t, grayscale_.size(), intrinsics_)); + preview_.draw_gizmos( + face, + last_roi_, + last_localizer_roi_, + world_to_image(pose.t, grayscale_.size(), intrinsics_)); if (settings_.show_network_input) { cv::Mat netinput = poseestimator_->last_network_input(); preview_.overlay_netinput(netinput); } - - //preview_.draw_fps(fps, last_inference_time); } -Affine NeuralNetTracker::compute_pose(const PoseEstimator::Face &face) const +QuatPose NeuralNetTracker::transform_to_world_pose(const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) const { - // Compute the location the network outputs in 3d space. 
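// Editor's sketch, not part of the patch: a condensed view of what the new
// transform_to_world_pose() computes. The off-axis correction and the network rotation
// combine by quaternion multiplication, and the configured neck offset (fwd/up/right,
// expressed in the face frame) is rotated into world space and added to the face position:
//   cv::Quatf rot = compute_rotation_correction(face_world_pos) * image_to_world(face_rotation);
//   cv::Vec3f pos = face_world_pos + rotate(rot, cv::Vec3f{offset_fwd, offset_up, offset_right});
//   return QuatPose{ rot, pos };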
- - const mat33 rot_correction = compute_rotation_correction( - normalize(face.center, grayscale_.rows, grayscale_.cols), - intrinsics_.focal_length_w); - - const mat33 m = rot_correction * quaternion_to_mat33( - image_to_world(face.rotation)); - - /* - - hhhhhh <- head size (meters) - \ | ----------------------- - \ | \ - \ | | - \ | |- tz (meters) - ____ <- face.size / width | - \ | | | - \| |- focal length / - ------------------------ - */ - const vec3 face_world_pos = image_to_world( - face.center.x, face.center.y, face.size, HEAD_SIZE_MM, + face_xy.x, face_xy.y, face_size, HEAD_SIZE_MM, grayscale_.size(), intrinsics_); + const cv::Quatf rot_correction = compute_rotation_correction( + face_world_pos); + + cv::Quatf rot = rot_correction * image_to_world(face_rotation); + // But this is in general not the location of the rotation joint in the neck. - // So we need an extra offset. Which we determine by solving + // So we need an extra offset. Which we determine by computing // z,y,z-pos = head_joint_loc + R_face * offset - const vec3 pos = face_world_pos - + m * vec3{ - static_cast<float>(settings_.offset_fwd), - static_cast<float>(settings_.offset_up), - static_cast<float>(settings_.offset_right)}; + const vec3 local_offset = vec3{ + static_cast<float>(settings_.offset_fwd), + static_cast<float>(settings_.offset_up), + static_cast<float>(settings_.offset_right)}; + const vec3 offset = rotate(rot, local_offset); + const vec3 pos = face_world_pos + offset; - return { m, pos }; + return { rot, pos }; } -void Preview::init(const cv_video_widget& widget) +QuatPose NeuralNetTracker::compute_filtered_pose(const PoseEstimator::Face &face) { - auto [w,h] = widget.preview_size(); - preview_size_ = { w, h }; -} - - -void Preview::copy_video_frame(const cv::Mat& frame) -{ - cv::Rect roi = make_crop_rect_for_aspect(frame.size(), preview_size_.width, preview_size_.height); - - cv::resize(frame(roi), preview_image_, preview_size_, 0, 0, cv::INTER_NEAREST); - - offset_ = { (float)-roi.x, (float)-roi.y }; - scale_ = float(preview_image_.cols) / float(roi.width); -} - - -void Preview::draw_gizmos( - const std::optional<PoseEstimator::Face> &face, - const Affine& pose, - const std::optional<cv::Rect2f>& last_roi, - const std::optional<cv::Rect2f>& last_localizer_roi, - const cv::Point2f& neckjoint_position) -{ - if (preview_image_.empty()) - return; - - if (last_roi) + if (fps_ > 0.01 && last_pose_ && poseestimator_->has_uncertainty()) { - const int col = 255; - cv::rectangle(preview_image_, transform(*last_roi), cv::Scalar(0, col, 0), /*thickness=*/1); - } - if (last_localizer_roi) - { - const int col = 255; - cv::rectangle(preview_image_, transform(*last_localizer_roi), cv::Scalar(col, 0, 255-col), /*thickness=*/1); + auto image2world = [this](const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) { + return this->transform_to_world_pose(face_rotation, face_xy, face_size); }; + + return apply_filter( + face, + *last_pose_, + 1./fps_, + std::move(image2world), + FiltParams{ + float(settings_.deadzone_hardness), + float(settings_.deadzone_size) + }); } - - if (face) + else { - if (face->size>=1.f) - cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), int(transform(face->size)), cv::Scalar(255,255,255), 2); - cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), 3, cv::Scalar(255,255,255), -1); - - auto draw_coord_line = [&](int i, const cv::Scalar& color) - { - const float vx = -pose.R(2,i); - const float vy = -pose.R(1,i); - static 
constexpr float len = 100.f; - cv::Point q = face->center + len*cv::Point2f{vx, vy}; - cv::line(preview_image_, static_cast<cv::Point>(transform(face->center)), static_cast<cv::Point>(transform(q)), color, 2); - }; - draw_coord_line(0, {0, 0, 255}); - draw_coord_line(1, {0, 255, 0}); - draw_coord_line(2, {255, 0, 0}); - - // Draw the computed joint position - auto xy = transform(neckjoint_position); - cv::circle(preview_image_, cv::Point(xy.x,xy.y), 5, cv::Scalar(0,0,255), -1); + return transform_to_world_pose(face.rotation, face.center, face.size); } } -void Preview::overlay_netinput(const cv::Mat& netinput) -{ - if (netinput.empty()) - return; - - const int w = std::min(netinput.cols, preview_image_.cols); - const int h = std::min(netinput.rows, preview_image_.rows); - cv::Rect roi(0, 0, w, h); - netinput(roi).copyTo(preview_image_(roi)); -} - -void Preview::draw_fps(double fps, double last_inference_time) -{ - char buf[128]; - ::snprintf(buf, sizeof(buf), "%d Hz, pose inference: %d ms", std::clamp(int(fps), 0, 9999), int(last_inference_time)); - cv::putText(preview_image_, buf, cv::Point(10, preview_image_.rows-10), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(0, 255, 0), 1); -} - - -void Preview::copy_to_widget(cv_video_widget& widget) -{ - if (preview_image_.rows > 0) - widget.update_image(preview_image_); -} - - -cv::Rect2f Preview::transform(const cv::Rect2f& r) const -{ - return { (r.x - offset_.x)*scale_, (r.y - offset_.y)*scale_, r.width*scale_, r.height*scale_ }; -} - -cv::Point2f Preview::transform(const cv::Point2f& p) const -{ - return { (p.x - offset_.x)*scale_ , (p.y - offset_.y)*scale_ }; -} - -float Preview::transform(float s) const -{ - return s * scale_; -} - NeuralNetTracker::NeuralNetTracker() { opencv_init(); + neuralnet_tracker_tests::run(); } @@ -921,6 +494,7 @@ bool NeuralNetTracker::load_and_initialize_model() << e.what(); return false; } + return true; } @@ -960,46 +534,6 @@ bool NeuralNetTracker::open_camera() } -void NeuralNetTracker::set_intrinsics() -{ - const int w = grayscale_.cols, h = grayscale_.rows; - const double diag_fov = settings_.fov * M_PI / 180.; - const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w)); - const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h)); - const double focal_length_w = 1. / tan(.5 * fov_w); - const double focal_length_h = 1. 
/ tan(.5 * fov_h); - - intrinsics_.fov_h = fov_h; - intrinsics_.fov_w = fov_w; - intrinsics_.focal_length_w = focal_length_w; - intrinsics_.focal_length_h = focal_length_h; -} - - -class GuardedThreadCountSwitch -{ - int old_num_threads_cv_ = 1; - int old_num_threads_omp_ = 1; - public: - GuardedThreadCountSwitch(int num_threads) - { - old_num_threads_cv_ = cv::getNumThreads(); - old_num_threads_omp_ = omp_get_num_threads(); - omp_set_num_threads(num_threads); - cv::setNumThreads(num_threads); - } - - ~GuardedThreadCountSwitch() - { - omp_set_num_threads(old_num_threads_omp_); - cv::setNumThreads(old_num_threads_cv_); - } - - GuardedThreadCountSwitch(const GuardedThreadCountSwitch&) = delete; - GuardedThreadCountSwitch& operator=(const GuardedThreadCountSwitch&) = delete; -}; - - void NeuralNetTracker::run() { preview_.init(*video_widget_); @@ -1055,7 +589,7 @@ void NeuralNetTracker::run() } } - set_intrinsics(); + intrinsics_ = make_intrinsics(grayscale_, settings_); detect(); @@ -1113,7 +647,7 @@ void NeuralNetTracker::data(double *data) Affine tmp = [&]() { QMutexLocker lck(&mtx_); - return pose_; + return last_pose_affine_; }(); const auto& mx = tmp.R.col(0); @@ -1140,7 +674,7 @@ void NeuralNetTracker::data(double *data) Affine NeuralNetTracker::pose() { QMutexLocker lck(&mtx_); - return pose_; + return last_pose_affine_; } std::tuple<cv::Size,double, double> NeuralNetTracker::stats() const |
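// Editor's addendum (illustrative, not part of the patch): minimal usage sketch of the
// GuardedThreadCountSwitch RAII helper defined earlier in this file. It pins OpenCV and
// OpenMP to a given worker count for the duration of a scope and restores the previous
// settings on exit, including on early return or exception.
void run_with_pinned_threads()
{
    GuardedThreadCountSwitch switch_num_threads_to(1); // constructor saves and overrides
    // ... grab frames, run detect(), publish poses ...
}   // destructor restores the previous cv::setNumThreads / omp_set_num_threads values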