author | Stanislaw Halik <sthalik@misaki.pl> | 2022-05-26 18:04:23 +0000
committer | GitHub <noreply@github.com> | 2022-05-26 18:04:23 +0000
commit | 015d1298cb8f853cc4b0cae6a62c13af130791b7 (patch)
tree | 2df114150815a442c710532914ba777067b317eb
parent | db54d8eb86b06e6c7ea44ff0c112c22ab85305c3 (diff)
parent | bbce410b03c2bb36cf2a07b1394fa04bf4f04c70 (diff)
Merge pull request #1476 from DaWelter/master
-rw-r--r-- | tracker-neuralnet/ftnoir_tracker_neuralnet.cpp | 990
-rw-r--r-- | tracker-neuralnet/ftnoir_tracker_neuralnet.h | 191
-rw-r--r-- | tracker-neuralnet/lang/nl_NL.ts | 58
-rw-r--r-- | tracker-neuralnet/lang/ru_RU.ts | 58
-rw-r--r-- | tracker-neuralnet/lang/stub.ts | 58
-rw-r--r-- | tracker-neuralnet/lang/zh_CN.ts | 58
-rw-r--r-- | tracker-neuralnet/neuralnet-trackercontrols.ui | 794
7 files changed, 1587 insertions, 620 deletions
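
Among the functional changes in this merge is an exponentially weighted moving average over the detected face ROI (`ewa_filter`, exposed through the new `roi-filter-alpha` setting), which smooths the crop handed to the pose-estimation network between frames. Below is a minimal standalone sketch of that smoothing step, assuming OpenCV; the non-templated helpers, the `main` driver, and the example `alpha` value are illustrative, only the blending math mirrors the `ewa_filter` added in the diff.

```cpp
// Sketch of the ROI smoothing introduced by this merge (assumes OpenCV).
// Blends the previous region of interest with the newly detected one;
// alpha = 1 passes the new box through unchanged, smaller values smooth more.
#include <opencv2/core.hpp>
#include <cstdio>

static cv::Point2f as_point(const cv::Size2f& s) { return { s.width, s.height }; }
static cv::Size2f  as_size(const cv::Point2f& p) { return { p.x, p.y }; }

cv::Rect2f ewa_filter(const cv::Rect2f& last, const cv::Rect2f& current, float alpha)
{
    const cv::Point2f last_center = 0.5f * (last.tl() + last.br());
    const cv::Point2f cur_center  = 0.5f * (current.tl() + current.br());
    // Interpolate size and center independently, then rebuild the rectangle.
    const cv::Point2f new_size   = as_point(last.size()) + alpha * (as_point(current.size()) - as_point(last.size()));
    const cv::Point2f new_center = last_center + alpha * (cur_center - last_center);
    return { new_center - 0.5f * new_size, as_size(new_size) };
}

int main()
{
    const cv::Rect2f previous{100.f, 80.f, 120.f, 120.f};  // hypothetical last-frame ROI
    const cv::Rect2f detected{130.f, 90.f, 140.f, 140.f};  // hypothetical new detection
    // 0.1 is only an example; the tracker exposes this as "ROI Smoothing Alpha".
    const cv::Rect2f smoothed = ewa_filter(previous, detected, 0.1f);
    std::printf("x=%.1f y=%.1f w=%.1f h=%.1f\n",
                smoothed.x, smoothed.y, smoothed.width, smoothed.height);
    return 0;
}
```

With `alpha` at 1 (the default of the new `roi-filter-alpha` option) the filter reduces to using the detected box directly, reproducing the old behaviour.
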
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp index 3687a6cd..62209978 100644 --- a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp +++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp @@ -15,6 +15,7 @@ #include <opencv2/calib3d.hpp> #include <opencv2/imgcodecs.hpp> #include "compat/timer.hpp" +#include "compat/check-visible.hpp" #include <omp.h> #ifdef _MSC_VER @@ -29,20 +30,22 @@ #include <cmath> #include <algorithm> #include <chrono> +#include <string> +#include <stdexcept> + // Some demo code for onnx // https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp // https://github.com/leimao/ONNX-Runtime-Inference/blob/main/src/inference.cpp -namespace +namespace neuralnet_tracker_ns { + using numeric_types::vec3; using numeric_types::vec2; using numeric_types::mat33; - -// Minimal difference if at all going from 1 to 2 threads. -static constexpr int num_threads = 1; +using quat = std::array<numeric_types::f,4>; #if _MSC_VER @@ -52,12 +55,52 @@ std::string convert(const QString &s) { return s.toStdString(); } #endif +template<class F> +struct OnScopeExit +{ + explicit OnScopeExit(F&& f) : f_{ f } {} + ~OnScopeExit() noexcept + { + f_(); + } + F f_; +}; + + float sigmoid(float x) { return 1.f/(1.f + std::exp(-x)); } +cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h) +{ + auto [w, h] = size; + if ( w*aspect_h > aspect_w*h ) + { + // Image is too wide + const int new_w = (aspect_w*h)/aspect_h; + return cv::Rect((w - new_w)/2, 0, new_w, h); + } + else + { + const int new_h = (aspect_h*w)/aspect_w; + return cv::Rect(0, (h - new_h)/2, w, new_h); + } +} + +cv::Rect make_crop_rect_multiple_of(const cv::Size &size, int multiple) +{ + const int new_w = (size.width / multiple) * multiple; + const int new_h = (size.height / multiple) * multiple; + return cv::Rect( + (size.width-new_w)/2, + (size.height-new_h)/2, + new_w, + new_h + ); +} + template<class T> cv::Rect_<T> squarize(const cv::Rect_<T> &r) { @@ -67,6 +110,41 @@ cv::Rect_<T> squarize(const cv::Rect_<T> &r) } +template<class T> +cv::Point_<T> as_point(const cv::Size_<T>& s) +{ + return { s.width, s.height }; +} + + +template<class T> +cv::Size_<T> as_size(const cv::Point_<T>& p) +{ + return { p.x, p.y }; +} + + +template<class T> +cv::Rect_<T> expand(const cv::Rect_<T>& r, T factor) +{ + // xnew = l+.5*w - w*f*0.5 = l + .5*(w - new_w) + const cv::Size_<T> new_size = { r.width * factor, r.height * factor }; + const cv::Point_<T> new_tl = r.tl() + (as_point(r.size()) - as_point(new_size)) / T(2); + return cv::Rect_<T>(new_tl, new_size); +} + + +template<class T> +cv::Rect_<T> ewa_filter(const cv::Rect_<T>& last, const cv::Rect_<T>& current, T alpha) +{ + const auto last_center = T(0.5) * (last.tl() + last.br()); + const auto cur_center = T(0.5) * (current.tl() + current.br()); + const cv::Point_<T> new_size = as_point(last.size()) + alpha * (as_point(current.size()) - as_point(last.size())); + const cv::Point_<T> new_center = last_center + alpha * (cur_center - last_center); + return cv::Rect_<T>(new_center - T(0.5) * new_size, as_size(new_size)); +} + + cv::Rect2f unnormalize(const cv::Rect2f &r, int h, int w) { auto unnorm = [](float x) -> float { return 0.5*(x+1); }; @@ -105,10 +183,9 @@ mat33 rotation_from_two_vectors(const vec3 &a, const vec3 &b) } -/* Computes correction due to head being off screen center. - x, y: In screen space, i.e. 
in [-1,1] - focal_length_x: In screen space -*/ +// Computes correction due to head being off screen center. +// x, y: In screen space, i.e. in [-1,1] +// focal_length_x: In screen space mat33 compute_rotation_correction(const cv::Point2f &p, float focal_length_x) { return rotation_from_two_vectors( @@ -137,6 +214,66 @@ mat33 quaternion_to_mat33(const std::array<float,4> quat) } +vec3 rotate_vec(const quat& q, const vec3& p) +{ + const float qw = q[0]; + const float qi = q[1]; + const float qj = q[2]; + const float qk = q[3]; + const float pi = p[0]; + const float pj = p[1]; + const float pk = p[2]; + const quat tmp{ + - qi*pi - qj*pj - qk*pk, + qw*pi + qj*pk - qk*pj, + qw*pj - qi*pk + qk*pi, + qw*pk + qi*pj - qj*pi + }; + const vec3 out { + -tmp[0]*qi + tmp[1]*qw - tmp[2]*qk + tmp[3]*qj, + -tmp[0]*qj + tmp[1]*qk + tmp[2]*qw - tmp[3]*qi, + -tmp[0]*qk - tmp[1]*qj + tmp[2]*qi + tmp[3]*qw + }; + return out; +} + + +vec3 image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) +{ + // Compute the location the network outputs in 3d space. + const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / size * reference_size_in_mm; + const float zpos = (x / image_size.width * 2.f - 1.f) * xpos / intrinsics.focal_length_w; + const float ypos = (y / image_size.height * 2.f - 1.f) * xpos / intrinsics.focal_length_h; + return {xpos, ypos, zpos}; +} + + +vec2 world_to_image(const vec3& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics) +{ + const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w; + const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h; + const float x = (xscr+1.)*0.5f*image_size.width; + const float y = (yscr+1.)*0.5f*image_size.height; + return {x, y}; +} + + +quat image_to_world(quat q) +{ + std::swap(q[1], q[3]); + q[1] = -q[1]; + q[2] = -q[2]; + q[3] = -q[3]; + return q; +} + +quat world_to_image(quat q) +{ + // It's its own inverse. + return image_to_world(q); +} + +// Intersection over union. A value between 0 and 1 which measures the match between the bounding boxes. template<class T> T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b) { @@ -144,12 +281,30 @@ T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b) return double{i.area()} / (a.area()+b.area()-i.area()); } - -} // namespace +// Returns width and height of the input tensor, or throws. +// Expects the model to take one tensor as input that must +// have the shape B x C x H x W, where B=C=1. 
+cv::Size get_input_image_shape(const Ort::Session &session) +{ + if (session.GetInputCount() < 1) + throw std::invalid_argument("Model must take at least one input tensor"); + const std::vector<std::int64_t> shape = + session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); + if (shape.size() != 4) + throw std::invalid_argument("Model takes the input tensor in the wrong shape"); + return { static_cast<int>(shape[3]), static_cast<int>(shape[2]) }; +} -namespace neuralnet_tracker_ns +Ort::Value create_tensor(const Ort::TypeInfo& info, Ort::Allocator& alloc) { + const auto shape = info.GetTensorTypeAndShapeInfo().GetShape(); + auto t = Ort::Value::CreateTensor<float>( + alloc, shape.data(), shape.size()); + memset(t.GetTensorMutableData<float>(), 0, sizeof(float)*info.GetTensorTypeAndShapeInfo().GetElementCount()); + return t; +} + int enum_to_fps(int value) @@ -165,52 +320,49 @@ int enum_to_fps(int value) Localizer::Localizer(Ort::MemoryInfo &allocator_info, Ort::Session &&session) : - session{std::move(session)}, - scaled_frame(input_img_height, input_img_width, CV_8U), - input_mat(input_img_height, input_img_width, CV_32F) + session_{std::move(session)}, + scaled_frame_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_8U), + input_mat_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_32F) { // Only works when input_mat does not reallocated memory ...which it should not. // Non-owning memory reference to input_mat? // Note: shape = (bach x channels x h x w) - const std::int64_t input_shape[4] = { 1, 1, input_img_height, input_img_width }; - input_val = Ort::Value::CreateTensor<float>(allocator_info, input_mat.ptr<float>(0), input_mat.total(), input_shape, 4); + const std::int64_t input_shape[4] = { 1, 1, INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH }; + input_val_ = Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4); const std::int64_t output_shape[2] = { 1, 5 }; - output_val = Ort::Value::CreateTensor<float>(allocator_info, results.data(), results.size(), output_shape, 2); + output_val_ = Ort::Value::CreateTensor<float>(allocator_info, results_.data(), results_.size(), output_shape, 2); } std::pair<float, cv::Rect2f> Localizer::run( const cv::Mat &frame) { - auto p = input_mat.ptr(0); + auto p = input_mat_.ptr(0); - cv::resize(frame, scaled_frame, { input_img_width, input_img_height }, 0, 0, cv::INTER_AREA); - scaled_frame.convertTo(input_mat, CV_32F, 1./255., -0.5); + cv::resize(frame, scaled_frame_, { INPUT_IMG_WIDTH, INPUT_IMG_HEIGHT }, 0, 0, cv::INTER_AREA); + scaled_frame_.convertTo(input_mat_, CV_32F, 1./255., -0.5); - assert (input_mat.ptr(0) == p); - assert (!input_mat.empty() && input_mat.isContinuous()); - assert (input_mat.cols == input_img_width && input_mat.rows == input_img_height); + assert (input_mat_.ptr(0) == p); + assert (!input_mat_.empty() && input_mat_.isContinuous()); + assert (input_mat_.cols == INPUT_IMG_WIDTH && input_mat_.rows == INPUT_IMG_HEIGHT); const char* input_names[] = {"x"}; const char* output_names[] = {"logit_box"}; Timer t; t.start(); - const auto nt = omp_get_num_threads(); - omp_set_num_threads(num_threads); - session.Run(Ort::RunOptions{nullptr}, input_names, &input_val, 1, output_names, &output_val, 1); - omp_set_num_threads(nt); + session_.Run(Ort::RunOptions{nullptr}, input_names, &input_val_, 1, output_names, &output_val_, 1); - last_inference_time = t.elapsed_ms(); + last_inference_time_ = t.elapsed_ms(); const cv::Rect2f roi = unnormalize(cv::Rect2f{ - results[1], - results[2], - results[3]-results[1], // 
Width - results[4]-results[2] // Height + results_[1], + results_[2], + results_[3]-results_[1], // Width + results_[4]-results_[2] // Height }, frame.rows, frame.cols); - const float score = sigmoid(results[0]); + const float score = sigmoid(results_[0]); return { score, roi }; } @@ -218,37 +370,91 @@ std::pair<float, cv::Rect2f> Localizer::run( double Localizer::last_inference_time_millis() const { - return last_inference_time; + return last_inference_time_; } -PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&session) : - session{std::move(session)}, - scaled_frame(input_img_height, input_img_width, CV_8U), - input_mat(input_img_height, input_img_width, CV_32F) +PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session) + : model_version_{_session.GetModelMetadata().GetVersion()} + , session_{std::move(_session)} + , allocator_{session_, allocator_info} { + using namespace std::literals::string_literals; + + if (session_.GetOutputCount() < 2) + throw std::runtime_error("Invalid Model: must have at least two outputs"); + + // WARNING UB .. but still ... + // If the model was saved without meta data, it seems the version field is uninitialized. + // In that case reading from it is UB. However, we will just get same arbitrary number + // which is hopefully different from the numbers used by models where the version is set. + // I.e., this is what happended in practice so far. + if (model_version_ != 2) + model_version_ = 1; + + const cv::Size input_image_shape = get_input_image_shape(session_); + + scaled_frame_ = cv::Mat(input_image_shape, CV_8U, cv::Scalar(0)); + input_mat_ = cv::Mat(input_image_shape, CV_32F, cv::Scalar(0.f)); + { - const std::int64_t input_shape[4] = { 1, 1, input_img_height, input_img_width }; - input_val = Ort::Value::CreateTensor<float>(allocator_info, input_mat.ptr<float>(0), input_mat.total(), input_shape, 4); + const std::int64_t input_shape[4] = { 1, 1, input_image_shape.height, input_image_shape.width }; + input_val_.push_back( + Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4)); } { const std::int64_t output_shape[2] = { 1, 3 }; - output_val[0] = Ort::Value::CreateTensor<float>( - allocator_info, &output_coord[0], output_coord.rows, output_shape, 2); + output_val_.push_back(Ort::Value::CreateTensor<float>( + allocator_info, &output_coord_[0], output_coord_.rows, output_shape, 2)); } { const std::int64_t output_shape[2] = { 1, 4 }; - output_val[1] = Ort::Value::CreateTensor<float>( - allocator_info, &output_quat[0], output_quat.rows, output_shape, 2); + output_val_.push_back(Ort::Value::CreateTensor<float>( + allocator_info, &output_quat_[0], output_quat_.rows, output_shape, 2)); } + size_t num_regular_outputs = 2; + + if (session_.GetOutputCount() >= 3 && "box"s == session_.GetOutputName(2, allocator_)) { const std::int64_t output_shape[2] = { 1, 4 }; - output_val[2] = Ort::Value::CreateTensor<float>( - allocator_info, &output_box[0], output_box.rows, output_shape, 2); + output_val_.push_back(Ort::Value::CreateTensor<float>( + allocator_info, &output_box_[0], output_box_.rows, output_shape, 2)); + ++num_regular_outputs; + qDebug() << "Note: Legacy model output for face ROI is currently ignored"; + } + + num_recurrent_states_ = session_.GetInputCount()-1; + if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_) + throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of 
inputs and outputs for tensors holding hidden states of recurrent layers."); + + // Create tensors for recurrent state + for (size_t i = 0; i < num_recurrent_states_; ++i) + { + const auto& input_info = session_.GetInputTypeInfo(1+i); + const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i); + if (input_info.GetTensorTypeAndShapeInfo().GetShape() != + output_info.GetTensorTypeAndShapeInfo().GetShape()) + throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output"); + input_val_.push_back(create_tensor(input_info, allocator_)); + output_val_.push_back(create_tensor(output_info, allocator_)); + } + + for (size_t i = 0; i < session_.GetInputCount(); ++i) + { + input_names_.push_back(session_.GetInputName(i, allocator_)); + } + for (size_t i = 0; i < session_.GetOutputCount(); ++i) + { + output_names_.push_back(session_.GetOutputName(i, allocator_)); } + + qDebug() << "Model inputs: " << session_.GetInputCount() << ", outputs: " << session_.GetOutputCount() << ", recurrent states: " << num_recurrent_states_; + + assert (input_names_.size() == input_val_.size()); + assert (output_names_.size() == output_val_.size()); } @@ -259,9 +465,9 @@ int PoseEstimator::find_input_intensity_90_pct_quantile() const float range[] = { 0, 256 }; const float* ranges[] = { range }; cv::Mat hist; - cv::calcHist(&scaled_frame, 1, channels, cv::Mat(), hist, 1, hist_size, ranges, true, false); + cv::calcHist(&scaled_frame_, 1, channels, cv::Mat(), hist, 1, hist_size, ranges, true, false); int gray_level = 0; - const int num_pixels_quantile = scaled_frame.total()*0.9f; + const int num_pixels_quantile = scaled_frame_.total()*0.9f; int num_pixels_accum = 0; for (int i=0; i<hist_size[0]; ++i) { @@ -280,7 +486,7 @@ std::optional<PoseEstimator::Face> PoseEstimator::run( const cv::Mat &frame, const cv::Rect &box) { cv::Mat cropped; - + const int patch_size = std::max(box.width, box.height)*1.05; const cv::Point2f patch_center = { std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols), @@ -288,64 +494,89 @@ std::optional<PoseEstimator::Face> PoseEstimator::run( }; cv::getRectSubPix(frame, {patch_size, patch_size}, patch_center, cropped); - // Will get failure if patch_center is outside image boundaries. + // Will get failure if patch_center is outside image boundariesettings. // Have to catch this case. if (cropped.rows != patch_size || cropped.cols != patch_size) return {}; - - auto p = input_mat.ptr(0); - cv::resize(cropped, scaled_frame, { input_img_width, input_img_height }, 0, 0, cv::INTER_AREA); + auto p = input_mat_.ptr(0); + + cv::resize(cropped, scaled_frame_, scaled_frame_.size(), 0, 0, cv::INTER_AREA); // Automatic brightness amplification. const int brightness = find_input_intensity_90_pct_quantile(); const double alpha = brightness<127 ? 
0.5/std::max(5,brightness) : 1./255; const double beta = -0.5; - scaled_frame.convertTo(input_mat, CV_32F, alpha, beta); + scaled_frame_.convertTo(input_mat_, CV_32F, alpha, beta); - assert (input_mat.ptr(0) == p); - assert (!input_mat.empty() && input_mat.isContinuous()); - assert (input_mat.cols == input_img_width && input_mat.rows == input_img_height); + assert (input_mat_.ptr(0) == p); + assert (!input_mat_.empty() && input_mat_.isContinuous()); - const char* input_names[] = {"x"}; - const char* output_names[] = {"pos_size", "quat", "box"}; Timer t; t.start(); - const auto nt = omp_get_num_threads(); - omp_set_num_threads(num_threads); - session.Run(Ort::RunOptions{nullptr}, input_names, &input_val, 1, output_names, output_val, 3); - omp_set_num_threads(nt); + try + { + session_.Run( + Ort::RunOptions{ nullptr }, + input_names_.data(), + input_val_.data(), + input_val_.size(), + output_names_.data(), + output_val_.data(), + output_val_.size()); + } + catch (const Ort::Exception &e) + { + qDebug() << "Failed to run the model: " << e.what(); + return {}; + } + + for (size_t i = 0; i<num_recurrent_states_; ++i) + { + // Next step, the current output becomes the input. + // Thus we realize the recurrent connection. + // Only swaps the internal pointers. There is no copy of data. + std::swap( + output_val_[output_val_.size()-num_recurrent_states_+i], + input_val_[input_val_.size()-num_recurrent_states_+i]); + } - // FIXME: Execution time fluctuates wildly. 19 to 26 ms. Why??? + // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why? // The instructions are always the same. Maybe a memory allocation // issue. The ONNX api suggests that tensor are allocated in an // arena. Does that matter? Maybe the issue is something else? - last_inference_time = t.elapsed_ms(); + last_inference_time_ = t.elapsed_ms(); // Perform coordinate transformation. // From patch-local normalized in [-1,1] to - // frame unnormalized pixel coordinates. + // frame unnormalized pixel coordinatesettings. const cv::Point2f center = patch_center + - (0.5f*patch_size)*cv::Point2f{output_coord[0], output_coord[1]}; + (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]}; - const float size = patch_size*0.5f*output_coord[2]; + const float size = patch_size*0.5f*output_coord_[2]; // Following Eigen which uses quat components in the order w, x, y, z. 
- const std::array<float,4> rotation = { - output_quat[3], - output_quat[0], - output_quat[1], - output_quat[2] }; + quat rotation = { + output_quat_[3], + output_quat_[0], + output_quat_[1], + output_quat_[2] }; + + if (model_version_ < 2) + { + // Due to a change in coordinate conventions + rotation = world_to_image(rotation); + } const cv::Rect2f outbox = { - patch_center.x + (0.5f*patch_size)*output_box[0], - patch_center.y + (0.5f*patch_size)*output_box[1], - 0.5f*patch_size*(output_box[2]-output_box[0]), - 0.5f*patch_size*(output_box[3]-output_box[1]) + patch_center.x + (0.5f*patch_size)*output_box_[0], + patch_center.y + (0.5f*patch_size)*output_box_[1], + 0.5f*patch_size*(output_box_[2]-output_box_[0]), + 0.5f*patch_size*(output_box_[3]-output_box_[1]) }; return std::optional<Face>({ @@ -356,66 +587,102 @@ std::optional<PoseEstimator::Face> PoseEstimator::run( cv::Mat PoseEstimator::last_network_input() const { + assert(!input_mat_.empty()); cv::Mat ret; - if (!input_mat.empty()) - { - input_mat.convertTo(ret, CV_8U, 255., 127.); - cv::cvtColor(ret, ret, cv::COLOR_GRAY2RGB); - } + input_mat_.convertTo(ret, CV_8U, 255., 127.); + cv::cvtColor(ret, ret, cv::COLOR_GRAY2RGB); return ret; } double PoseEstimator::last_inference_time_millis() const { - return last_inference_time; + return last_inference_time_; } -bool neuralnet_tracker::detect() +bool NeuralNetTracker::detect() { - // Note: BGR colors! - if (!last_localizer_roi || !last_roi || - iou(*last_localizer_roi,*last_roi)<0.25) + double inference_time = 0.; + + OnScopeExit update_inference_time{ [&]() { + + QMutexLocker lck{ &stats_mtx_ }; + inference_time_ = inference_time; + } }; + + // If there is no past ROI from the localizer or if the match of its output + // with the current ROI is too poor we have to run it again. This causes a + // latency spike of maybe an additional 50%. But it only occurs when the user + // moves his head far enough - or when the tracking ist lost ... + if (!last_localizer_roi_ || !last_roi_ || + iou(*last_localizer_roi_,*last_roi_)<0.25) { - auto [p, rect] = localizer->run(grayscale); - last_inference_time += localizer->last_inference_time_millis(); - if (p > 0.5 || rect.height < 5 || rect.width < 5) + auto [p, rect] = localizer_->run(grayscale_); + inference_time += localizer_->last_inference_time_millis(); + + if (last_roi_ && iou(rect,*last_roi_)>=0.25 && p > 0.5) + { + // The new ROI matches the result from tracking, so the user is + // still there and to not disturb recurrent models, we only update + // ... + last_localizer_roi_ = rect; + } + else if (p > 0.5 && rect.height > 32 && rect.width > 32) { - last_localizer_roi = rect; - last_roi = rect; + // Tracking probably got lost since the ROI's don't match, but the + // localizer still finds a face, so we use the ROI from the localizer + last_localizer_roi_ = rect; + last_roi_ = rect; } else { - last_roi.reset(); - last_localizer_roi.reset(); + // Tracking lost and no localization result. The user probably can't be seen. 
+ last_roi_.reset(); + last_localizer_roi_.reset(); } } - if (!last_roi) + if (!last_roi_) { - draw_gizmos(frame, {}, {}); + draw_gizmos({}, {}); return false; } - auto face = poseestimator->run(grayscale, *last_roi); - last_inference_time += poseestimator->last_inference_time_millis(); - + auto face = poseestimator_->run(grayscale_, *last_roi_); + inference_time += poseestimator_->last_inference_time_millis(); + if (!face) { - last_roi.reset(); - draw_gizmos(frame, *face, {}); + last_roi_.reset(); + draw_gizmos(*face, {}); return false; } - last_roi = face->box; + { + // Here: compute ROI from head size estimate. This helps make the size prediction more + // invariant to mouth opening. The tracking can be lost more often at extreme + // rotations, depending on the implementation details. The code down here has + // been tweaked so that it works pretty well. + // In old behaviour ROI is taken from the model outputs + const vec3 offset = rotate_vec(face->rotation, vec3{0.f, 0.1f*face->size, face->size*0.3f}); + const float halfsize = face->size/float(settings_.roi_zoom); + face->box = cv::Rect2f( + face->center.x + offset[0] - halfsize, + face->center.y + offset[1] - halfsize, + halfsize*2.f, + halfsize*2.f + ); + } + + last_roi_ = ewa_filter(*last_roi_, face->box, float(settings_.roi_filter_alpha)); Affine pose = compute_pose(*face); - draw_gizmos(frame, *face, pose); + draw_gizmos(*face, pose); { - QMutexLocker lck(&mtx); + QMutexLocker lck(&mtx_); this->pose_ = pose; } @@ -423,13 +690,35 @@ bool neuralnet_tracker::detect() } -Affine neuralnet_tracker::compute_pose(const PoseEstimator::Face &face) const +void NeuralNetTracker::draw_gizmos( + const std::optional<PoseEstimator::Face> &face, + const Affine& pose) { + if (!is_visible_) + return; + + preview_.draw_gizmos(face, pose, last_roi_, last_localizer_roi_, world_to_image(pose.t, grayscale_.size(), intrinsics_)); + + if (settings_.show_network_input) + { + cv::Mat netinput = poseestimator_->last_network_input(); + preview_.overlay_netinput(netinput); + } + + //preview_.draw_fps(fps, last_inference_time); +} + + +Affine NeuralNetTracker::compute_pose(const PoseEstimator::Face &face) const +{ + // Compute the location the network outputs in 3d space. + const mat33 rot_correction = compute_rotation_correction( - normalize(face.center, frame.rows, frame.cols), - intrinsics.focal_length_w); + normalize(face.center, grayscale_.rows, grayscale_.cols), + intrinsics_.focal_length_w); - const mat33 m = rot_correction * quaternion_to_mat33(face.rotation); + const mat33 m = rot_correction * quaternion_to_mat33( + image_to_world(face.rotation)); /* @@ -444,44 +733,68 @@ Affine neuralnet_tracker::compute_pose(const PoseEstimator::Face &face) const ------------------------ */ - // Compute the location the network outputs in 3d space. - const vec3 face_world_pos = image_to_world(face.center.x, face.center.y, face.size, head_size_mm); + const vec3 face_world_pos = image_to_world( + face.center.x, face.center.y, face.size, HEAD_SIZE_MM, + grayscale_.size(), + intrinsics_); // But this is in general not the location of the rotation joint in the neck. // So we need an extra offset. 
Which we determine by solving // z,y,z-pos = head_joint_loc + R_face * offset - const vec3 pos = face_world_pos + m * vec3{ - static_cast<float>(s.offset_fwd), - static_cast<float>(s.offset_up), - static_cast<float>(s.offset_right)}; + static_cast<float>(settings_.offset_fwd), + static_cast<float>(settings_.offset_up), + static_cast<float>(settings_.offset_right)}; return { m, pos }; } -void neuralnet_tracker::draw_gizmos( - cv::Mat frame, +void Preview::init(const cv_video_widget& widget) +{ + auto [w,h] = widget.preview_size(); + preview_size_ = { w, h }; +} + + +void Preview::copy_video_frame(const cv::Mat& frame) +{ + cv::Rect roi = make_crop_rect_for_aspect(frame.size(), preview_size_.width, preview_size_.height); + + cv::resize(frame(roi), preview_image_, preview_size_, 0, 0, cv::INTER_NEAREST); + + offset_ = { (float)-roi.x, (float)-roi.y }; + scale_ = float(preview_image_.cols) / float(roi.width); +} + + +void Preview::draw_gizmos( const std::optional<PoseEstimator::Face> &face, - const Affine& pose) const + const Affine& pose, + const std::optional<cv::Rect2f>& last_roi, + const std::optional<cv::Rect2f>& last_localizer_roi, + const cv::Point2f& neckjoint_position) { + if (preview_image_.empty()) + return; + if (last_roi) { const int col = 255; - cv::rectangle(frame, *last_roi, cv::Scalar(0, col, 0), /*thickness=*/1); + cv::rectangle(preview_image_, transform(*last_roi), cv::Scalar(0, col, 0), /*thickness=*/1); } if (last_localizer_roi) { const int col = 255; - cv::rectangle(frame, *last_localizer_roi, cv::Scalar(col, 0, 255-col), /*thickness=*/1); + cv::rectangle(preview_image_, transform(*last_localizer_roi), cv::Scalar(col, 0, 255-col), /*thickness=*/1); } if (face) { if (face->size>=1.f) - cv::circle(frame, static_cast<cv::Point>(face->center), int(face->size), cv::Scalar(255,255,255), 2); - cv::circle(frame, static_cast<cv::Point>(face->center), 3, cv::Scalar(255,255,255), -1); + cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), int(transform(face->size)), cv::Scalar(255,255,255), 2); + cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), 3, cv::Scalar(255,255,255), -1); auto draw_coord_line = [&](int i, const cv::Scalar& color) { @@ -489,43 +802,67 @@ void neuralnet_tracker::draw_gizmos( const float vy = -pose.R(1,i); static constexpr float len = 100.f; cv::Point q = face->center + len*cv::Point2f{vx, vy}; - cv::line(frame, static_cast<cv::Point>(face->center), static_cast<cv::Point>(q), color, 2); + cv::line(preview_image_, static_cast<cv::Point>(transform(face->center)), static_cast<cv::Point>(transform(q)), color, 2); }; draw_coord_line(0, {0, 0, 255}); draw_coord_line(1, {0, 255, 0}); draw_coord_line(2, {255, 0, 0}); // Draw the computed joint position - auto xy = world_to_image(pose.t); - cv::circle(frame, cv::Point(xy[0],xy[1]), 5, cv::Scalar(0,0,255), -1); + auto xy = transform(neckjoint_position); + cv::circle(preview_image_, cv::Point(xy.x,xy.y), 5, cv::Scalar(0,0,255), -1); } +} - if (s.show_network_input) - { - cv::Mat netinput = poseestimator->last_network_input(); - if (!netinput.empty()) - { - const int w = std::min(netinput.cols, frame.cols); - const int h = std::min(netinput.rows, frame.rows); - cv::Rect roi(0, 0, w, h); - netinput(roi).copyTo(frame(roi)); - } - } +void Preview::overlay_netinput(const cv::Mat& netinput) +{ + if (netinput.empty()) + return; + + const int w = std::min(netinput.cols, preview_image_.cols); + const int h = std::min(netinput.rows, preview_image_.rows); + cv::Rect roi(0, 0, w, h); + 
netinput(roi).copyTo(preview_image_(roi)); +} +void Preview::draw_fps(double fps, double last_inference_time) +{ char buf[128]; ::snprintf(buf, sizeof(buf), "%d Hz, pose inference: %d ms", std::clamp(int(fps), 0, 9999), int(last_inference_time)); - cv::putText(frame, buf, cv::Point(10, frame.rows-10), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(0, 255, 0), 1); + cv::putText(preview_image_, buf, cv::Point(10, preview_image_.rows-10), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(0, 255, 0), 1); } -neuralnet_tracker::neuralnet_tracker() +void Preview::copy_to_widget(cv_video_widget& widget) +{ + if (preview_image_.rows > 0) + widget.update_image(preview_image_); +} + + +cv::Rect2f Preview::transform(const cv::Rect2f& r) const +{ + return { (r.x - offset_.x)*scale_, (r.y - offset_.y)*scale_, r.width*scale_, r.height*scale_ }; +} + +cv::Point2f Preview::transform(const cv::Point2f& p) const +{ + return { (p.x - offset_.x)*scale_ , (p.y - offset_.y)*scale_ }; +} + +float Preview::transform(float s) const +{ + return s * scale_; +} + + +NeuralNetTracker::NeuralNetTracker() { opencv_init(); - cv::setNumThreads(num_threads); } -neuralnet_tracker::~neuralnet_tracker() +NeuralNetTracker::~NeuralNetTracker() { requestInterruption(); wait(); @@ -534,21 +871,22 @@ neuralnet_tracker::~neuralnet_tracker() } -module_status neuralnet_tracker::start_tracker(QFrame* videoframe) +module_status NeuralNetTracker::start_tracker(QFrame* videoframe) { videoframe->show(); - videoWidget = std::make_unique<cv_video_widget>(videoframe); - layout = std::make_unique<QHBoxLayout>(); - layout->setContentsMargins(0, 0, 0, 0); - layout->addWidget(&*videoWidget); - videoframe->setLayout(&*layout); - videoWidget->show(); + video_widget_ = std::make_unique<cv_video_widget>(videoframe); + layout_ = std::make_unique<QHBoxLayout>(); + layout_->setContentsMargins(0, 0, 0, 0); + layout_->addWidget(&*video_widget_); + videoframe->setLayout(&*layout_); + video_widget_->show(); + num_threads_ = settings_.num_threads; start(); return status_ok(); } -bool neuralnet_tracker::load_and_initialize_model() +bool NeuralNetTracker::load_and_initialize_model() { const QString localizer_model_path_enc = OPENTRACK_BASE_PATH+"/" OPENTRACK_LIBRARY_PATH "/models/head-localizer.onnx"; @@ -557,26 +895,25 @@ bool neuralnet_tracker::load_and_initialize_model() try { - env = Ort::Env{ + env_ = Ort::Env{ OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR, "tracker-neuralnet" }; auto opts = Ort::SessionOptions{}; // Do thread settings here do anything? // There is a warning which says to control number of threads via - // openmp settings. Which is what we do. omp_set_num_threads directly - // before running the inference pass. - opts.SetIntraOpNumThreads(num_threads); - opts.SetInterOpNumThreads(num_threads); - allocator_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); - - localizer.emplace( - allocator_info, - Ort::Session{env, convert(localizer_model_path_enc).c_str(), opts}); - - poseestimator.emplace( - allocator_info, - Ort::Session{env, convert(poseestimator_model_path_enc).c_str(), opts}); + // openmp settings. Which is what we do. 
+ opts.SetIntraOpNumThreads(num_threads_); + opts.SetInterOpNumThreads(1); + allocator_info_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + + localizer_.emplace( + allocator_info_, + Ort::Session{env_, convert(localizer_model_path_enc).c_str(), opts}); + + poseestimator_.emplace( + allocator_info_, + Ort::Session{env_, convert(poseestimator_model_path_enc).c_str(), opts}); } catch (const Ort::Exception &e) { @@ -588,74 +925,87 @@ bool neuralnet_tracker::load_and_initialize_model() } -bool neuralnet_tracker::open_camera() +bool NeuralNetTracker::open_camera() { - int fps = enum_to_fps(s.force_fps); + int rint = std::clamp(*settings_.resolution, 0, (int)std::size(resolution_choices)-1); + resolution_tuple res = resolution_choices[rint]; + int fps = enum_to_fps(settings_.force_fps); - QMutexLocker l(&camera_mtx); + QMutexLocker l(&camera_mtx_); - camera = video::make_camera(s.camera_name); + camera_ = video::make_camera(settings_.camera_name); - if (!camera) + if (!camera_) return false; video::impl::camera::info args {}; - args.width = 320; - args.height = 240; - + if (res.width) + { + args.width = res.width; + args.height = res.height; + } if (fps) args.fps = fps; - args.use_mjpeg = s.use_mjpeg; + args.use_mjpeg = settings_.use_mjpeg; - if (!camera->start(args)) + if (!camera_->start(args)) { qDebug() << "neuralnet tracker: can't open camera"; return false; } + return true; } -void neuralnet_tracker::set_intrinsics() +void NeuralNetTracker::set_intrinsics() { - const int w = grayscale.cols, h = grayscale.rows; - const double diag_fov = s.fov * M_PI / 180.; + const int w = grayscale_.cols, h = grayscale_.rows; + const double diag_fov = settings_.fov * M_PI / 180.; const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w)); const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h)); const double focal_length_w = 1. / tan(.5 * fov_w); const double focal_length_h = 1. / tan(.5 * fov_h); - intrinsics.fov_h = fov_h; - intrinsics.fov_w = fov_w; - intrinsics.focal_length_w = focal_length_w; - intrinsics.focal_length_h = focal_length_h; + intrinsics_.fov_h = fov_h; + intrinsics_.fov_w = fov_w; + intrinsics_.focal_length_w = focal_length_w; + intrinsics_.focal_length_h = focal_length_h; } -vec3 neuralnet_tracker::image_to_world(float x, float y, float size, float real_size) const +class GuardedThreadCountSwitch { - // Compute the location the network outputs in 3d space. 
- const float xpos = -(intrinsics.focal_length_w * frame.cols * 0.5f) / size * real_size; - const float zpos = (x / frame.cols * 2.f - 1.f) * xpos / intrinsics.focal_length_w; - const float ypos = (y / frame.rows * 2.f - 1.f) * xpos / intrinsics.focal_length_h; - return {xpos, ypos, zpos}; -} + int old_num_threads_cv_ = 1; + int old_num_threads_omp_ = 1; + public: + GuardedThreadCountSwitch(int num_threads) + { + old_num_threads_cv_ = cv::getNumThreads(); + old_num_threads_omp_ = omp_get_num_threads(); + omp_set_num_threads(num_threads); + cv::setNumThreads(num_threads); + } + + ~GuardedThreadCountSwitch() + { + omp_set_num_threads(old_num_threads_omp_); + cv::setNumThreads(old_num_threads_cv_); + } + + GuardedThreadCountSwitch(const GuardedThreadCountSwitch&) = delete; + GuardedThreadCountSwitch& operator=(const GuardedThreadCountSwitch&) = delete; +}; -vec2 neuralnet_tracker::world_to_image(const vec3& pos) const +void NeuralNetTracker::run() { - const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w; - const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h; - const float x = (xscr+1.)*0.5f*frame.cols; - const float y = (yscr+1.)*0.5f*frame.rows; - return {x, y}; -} + preview_.init(*video_widget_); + GuardedThreadCountSwitch switch_num_threads_to(num_threads_); -void neuralnet_tracker::run() -{ if (!open_camera()) return; @@ -666,12 +1016,12 @@ void neuralnet_tracker::run() while (!isInterruptionRequested()) { - last_inference_time = 0; + is_visible_ = check_is_visible(); auto t = clk.now(); { - QMutexLocker l(&camera_mtx); + QMutexLocker l(&camera_mtx_); - auto [ img, res ] = camera->get_frame(); + auto [ img, res ] = camera_->get_frame(); if (!res) { @@ -680,17 +1030,24 @@ void neuralnet_tracker::run() continue; } - auto color = cv::Mat(img.height, img.width, CV_8UC(img.channels), (void*)img.data, img.stride); - color.copyTo(frame); + { + QMutexLocker lck{&stats_mtx_}; + resolution_ = { img.width, img.height }; + } + + auto color = prepare_input_image(img); + + if (is_visible_) + preview_.copy_video_frame(color); switch (img.channels) { case 1: - grayscale.create(img.height, img.width, CV_8UC1); - color.copyTo(grayscale); + grayscale_.create(img.height, img.width, CV_8UC1); + color.copyTo(grayscale_); break; case 3: - cv::cvtColor(color, grayscale, cv::COLOR_BGR2GRAY); + cv::cvtColor(color, grayscale_, cv::COLOR_BGR2GRAY); break; default: qDebug() << "Can't handle" << img.channels << "color channels"; @@ -702,8 +1059,8 @@ void neuralnet_tracker::run() detect(); - if (frame.rows > 0) - videoWidget->update_image(frame); + if (is_visible_) + preview_.copy_to_widget(*video_widget_); update_fps( std::chrono::duration_cast<std::chrono::milliseconds>( @@ -712,23 +1069,50 @@ void neuralnet_tracker::run() } -void neuralnet_tracker::update_fps(double dt) +cv::Mat NeuralNetTracker::prepare_input_image(const video::frame& frame) { - const double alpha = dt/(dt + RC); + auto img = cv::Mat(frame.height, frame.width, CV_8UC(frame.channels), (void*)frame.data, frame.stride); + // Crop if aspect ratio is not 4:3 + if (img.rows*4 != img.cols*3) + { + img = img(make_crop_rect_for_aspect(img.size(), 4, 3)); + } + + img = img(make_crop_rect_multiple_of(img.size(), 4)); + + if (img.cols > 640) + { + cv::pyrDown(img, downsized_original_images_[0]); + img = downsized_original_images_[0]; + } + if (img.cols > 640) + { + cv::pyrDown(img, downsized_original_images_[1]); + img = downsized_original_images_[1]; + } + + return img; +} + + +void NeuralNetTracker::update_fps(double dt) +{ + const 
double alpha = dt/(dt + RC); if (dt > 1e-6) { - fps *= 1 - alpha; - fps += alpha * 1./dt; + QMutexLocker lck{&stats_mtx_}; + fps_ *= 1 - alpha; + fps_ += alpha * 1./dt; } } -void neuralnet_tracker::data(double *data) +void NeuralNetTracker::data(double *data) { Affine tmp = [&]() { - QMutexLocker lck(&mtx); + QMutexLocker lck(&mtx_); return pose_; }(); @@ -753,113 +1137,155 @@ void neuralnet_tracker::data(double *data) } -Affine neuralnet_tracker::pose() +Affine NeuralNetTracker::pose() { - QMutexLocker lck(&mtx); + QMutexLocker lck(&mtx_); return pose_; } +std::tuple<cv::Size,double, double> NeuralNetTracker::stats() const +{ + QMutexLocker lck(&stats_mtx_); + return { resolution_, fps_, inference_time_ }; +} -void neuralnet_dialog::make_fps_combobox() +void NeuralNetDialog::make_fps_combobox() { for (int k = 0; k < fps_MAX; k++) { const int hz = enum_to_fps(k); const QString name = (hz == 0) ? tr("Default") : QString::number(hz); - ui.cameraFPS->addItem(name, k); + ui_.cameraFPS->addItem(name, k); + } +} + +void NeuralNetDialog::make_resolution_combobox() +{ + int k=0; + for (const auto [w, h] : resolution_choices) + { + const QString s = (w == 0) + ? tr("Default") + : QString::number(w) + " x " + QString::number(h); + ui_.resolution->addItem(s, k++); } } -neuralnet_dialog::neuralnet_dialog() : - trans_calib(1, 2) +NeuralNetDialog::NeuralNetDialog() : + trans_calib_(1, 2) { - ui.setupUi(this); + ui_.setupUi(this); make_fps_combobox(); - tie_setting(s.force_fps, ui.cameraFPS); + make_resolution_combobox(); for (const auto& str : video::camera_names()) - ui.cameraName->addItem(str); - - tie_setting(s.camera_name, ui.cameraName); - tie_setting(s.fov, ui.cameraFOV); - tie_setting(s.offset_fwd, ui.tx_spin); - tie_setting(s.offset_up, ui.ty_spin); - tie_setting(s.offset_right, ui.tz_spin); - tie_setting(s.show_network_input, ui.showNetworkInput); - tie_setting(s.use_mjpeg, ui.use_mjpeg); - - connect(ui.buttonBox, SIGNAL(accepted()), this, SLOT(doOK())); - connect(ui.buttonBox, SIGNAL(rejected()), this, SLOT(doCancel())); - connect(ui.camera_settings, SIGNAL(clicked()), this, SLOT(camera_settings())); - - connect(&s.camera_name, value_::value_changed<QString>(), this, &neuralnet_dialog::update_camera_settings_state); - - update_camera_settings_state(s.camera_name); - - connect(&calib_timer, &QTimer::timeout, this, &neuralnet_dialog::trans_calib_step); - calib_timer.setInterval(35); - connect(ui.tcalib_button,SIGNAL(toggled(bool)), this, SLOT(startstop_trans_calib(bool))); + ui_.cameraName->addItem(str); + + tie_setting(settings_.camera_name, ui_.cameraName); + tie_setting(settings_.fov, ui_.cameraFOV); + tie_setting(settings_.offset_fwd, ui_.tx_spin); + tie_setting(settings_.offset_up, ui_.ty_spin); + tie_setting(settings_.offset_right, ui_.tz_spin); + tie_setting(settings_.show_network_input, ui_.showNetworkInput); + tie_setting(settings_.roi_filter_alpha, ui_.roiFilterAlpha); + tie_setting(settings_.use_mjpeg, ui_.use_mjpeg); + tie_setting(settings_.roi_zoom, ui_.roiZoom); + tie_setting(settings_.num_threads, ui_.threadCount); + tie_setting(settings_.resolution, ui_.resolution); + tie_setting(settings_.force_fps, ui_.cameraFPS); + + connect(ui_.buttonBox, SIGNAL(accepted()), this, SLOT(doOK())); + connect(ui_.buttonBox, SIGNAL(rejected()), this, SLOT(doCancel())); + connect(ui_.camera_settings, SIGNAL(clicked()), this, SLOT(camera_settings())); + + connect(&settings_.camera_name, value_::value_changed<QString>(), this, &NeuralNetDialog::update_camera_settings_state); + + 
update_camera_settings_state(settings_.camera_name); + + connect(&calib_timer_, &QTimer::timeout, this, &NeuralNetDialog::trans_calib_step); + calib_timer_.setInterval(35); + connect(ui_.tcalib_button,SIGNAL(toggled(bool)), this, SLOT(startstop_trans_calib(bool))); + + connect(&tracker_status_poll_timer_, &QTimer::timeout, this, &NeuralNetDialog::status_poll); + tracker_status_poll_timer_.setInterval(250); + tracker_status_poll_timer_.start(); } -void neuralnet_dialog::doOK() +void NeuralNetDialog::doOK() { - s.b->save(); + settings_.b->save(); close(); } -void neuralnet_dialog::doCancel() +void NeuralNetDialog::doCancel() { close(); } -void neuralnet_dialog::camera_settings() +void NeuralNetDialog::camera_settings() { - if (tracker) + if (tracker_) { - QMutexLocker l(&tracker->camera_mtx); - (void)tracker->camera->show_dialog(); + QMutexLocker l(&tracker_->camera_mtx_); + (void)tracker_->camera_->show_dialog(); } else - (void)video::show_dialog(s.camera_name); + (void)video::show_dialog(settings_.camera_name); } -void neuralnet_dialog::update_camera_settings_state(const QString& name) +void NeuralNetDialog::update_camera_settings_state(const QString& name) { (void)name; - ui.camera_settings->setEnabled(true); + ui_.camera_settings->setEnabled(true); +} + + +void NeuralNetDialog::register_tracker(ITracker * x) +{ + tracker_ = static_cast<NeuralNetTracker*>(x); + ui_.tcalib_button->setEnabled(true); } -void neuralnet_dialog::register_tracker(ITracker * x) +void NeuralNetDialog::unregister_tracker() { - tracker = static_cast<neuralnet_tracker*>(x); - ui.tcalib_button->setEnabled(true); + tracker_ = nullptr; + ui_.tcalib_button->setEnabled(false); } -void neuralnet_dialog::unregister_tracker() +void NeuralNetDialog::status_poll() { - tracker = nullptr; - ui.tcalib_button->setEnabled(false); + QString status; + if (!tracker_) + { + status = tr("Tracker Offline"); + } + else + { + auto [ res, fps, inference_time ] = tracker_->stats(); + status = tr("%1x%2 @ %3 FPS / Inference: %4 ms").arg(res.width).arg(res.height).arg(int(fps)).arg(int(inference_time)); + } + ui_.resolution_display->setText(status); } -void neuralnet_dialog::trans_calib_step() +void NeuralNetDialog::trans_calib_step() { - if (tracker) + if (tracker_) { const Affine X_CM = [&]() { - QMutexLocker l(&calibrator_mutex); - return tracker->pose(); + QMutexLocker l(&calibrator_mutex_); + return tracker_->pose(); }(); - trans_calib.update(X_CM.R, X_CM.t); - auto [_, nsamples] = trans_calib.get_estimate(); + trans_calib_.update(X_CM.R, X_CM.t); + auto [_, nsamples] = trans_calib_.get_estimate(); constexpr int min_yaw_samples = 15; constexpr int min_pitch_samples = 12; @@ -878,52 +1304,52 @@ void neuralnet_dialog::trans_calib_step() const int nsamples_total = nsamples[0] + nsamples[1]; sample_feedback = tr("%1 samples. Over %2, good!").arg(nsamples_total).arg(min_samples); } - ui.sample_count_display->setText(sample_feedback); + ui_.sample_count_display->setText(sample_feedback); } else startstop_trans_calib(false); } -void neuralnet_dialog::startstop_trans_calib(bool start) +void NeuralNetDialog::startstop_trans_calib(bool start) { - QMutexLocker l(&calibrator_mutex); + QMutexLocker l(&calibrator_mutex_); // FIXME: does not work ... 
if (start) { qDebug() << "pt: starting translation calibration"; - calib_timer.start(); - trans_calib.reset(); - ui.sample_count_display->setText(QString()); + calib_timer_.start(); + trans_calib_.reset(); + ui_.sample_count_display->setText(QString()); // Tracker must run with zero'ed offset for calibration. - s.offset_fwd = 0; - s.offset_up = 0; - s.offset_right = 0; + settings_.offset_fwd = 0; + settings_.offset_up = 0; + settings_.offset_right = 0; } else { - calib_timer.stop(); + calib_timer_.stop(); qDebug() << "pt: stopping translation calibration"; { - auto [tmp, nsamples] = trans_calib.get_estimate(); - s.offset_fwd = int(tmp[0]); - s.offset_up = int(tmp[1]); - s.offset_right = int(tmp[2]); + auto [tmp, nsamples] = trans_calib_.get_estimate(); + settings_.offset_fwd = int(tmp[0]); + settings_.offset_up = int(tmp[1]); + settings_.offset_right = int(tmp[2]); } } - ui.tx_spin->setEnabled(!start); - ui.ty_spin->setEnabled(!start); - ui.tz_spin->setEnabled(!start); + ui_.tx_spin->setEnabled(!start); + ui_.ty_spin->setEnabled(!start); + ui_.tz_spin->setEnabled(!start); if (start) - ui.tcalib_button->setText(tr("Stop calibration")); + ui_.tcalib_button->setText(tr("Stop calibration")); else - ui.tcalib_button->setText(tr("Start calibration")); + ui_.tcalib_button->setText(tr("Start calibration")); } -settings::settings() : opts("neuralnet-tracker") {} +Settings::Settings() : opts("neuralnet-tracker") {} } // neuralnet_tracker_ns -OPENTRACK_DECLARE_TRACKER(neuralnet_tracker, neuralnet_dialog, neuralnet_metadata) +OPENTRACK_DECLARE_TRACKER(NeuralNetTracker, NeuralNetDialog, NeuralNetMetadata) diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.h b/tracker-neuralnet/ftnoir_tracker_neuralnet.h index 5f9c6fbe..9b481186 100644 --- a/tracker-neuralnet/ftnoir_tracker_neuralnet.h +++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.h @@ -25,6 +25,7 @@ #include <memory> #include <cinttypes> +#include <array> #include <onnxruntime_cxx_api.h> @@ -49,8 +50,25 @@ enum fps_choices fps_MAX = 3 }; +struct resolution_tuple +{ + int width; + int height; +}; + +static const std::array<resolution_tuple, 7> resolution_choices = +{{ + { 320, 240 }, + { 640, 480 }, + { 800, 600 }, + { 1024, 768 }, + { 1280, 720 }, + { 1920, 1080}, + { 0, 0 } +}}; -struct settings : opts { + +struct Settings : opts { value<int> offset_fwd { b, "offset-fwd", 200 }, // Millimeters offset_up { b, "offset-up", 0 }, offset_right { b, "offset-right", 0 }; @@ -58,8 +76,12 @@ struct settings : opts { value<int> fov { b, "field-of-view", 56 }; value<fps_choices> force_fps { b, "force-fps", fps_default }; value<bool> show_network_input { b, "show-network-input", false }; + value<double> roi_filter_alpha{ b, "roi-filter-alpha", 1. }; + value<double> roi_zoom{ b, "roi-zoom", 1. 
}; value<bool> use_mjpeg { b, "use-mjpeg", false }; - settings(); + value<int> num_threads { b, "num-threads", 1 }; + value<int> resolution { b, "force-resolution", 0 }; + Settings(); }; @@ -85,14 +107,14 @@ class Localizer double last_inference_time_millis() const; private: - inline static constexpr int input_img_width = 288; - inline static constexpr int input_img_height = 224; - Ort::Session session{nullptr}; + inline static constexpr int INPUT_IMG_WIDTH = 288; + inline static constexpr int INPUT_IMG_HEIGHT = 224; + Ort::Session session_{nullptr}; // Inputs / outputs - cv::Mat scaled_frame{}, input_mat{}; - Ort::Value input_val{nullptr}, output_val{nullptr}; - std::array<float, 5> results; - double last_inference_time = 0; + cv::Mat scaled_frame_{}, input_mat_{}; + Ort::Value input_val_{nullptr}, output_val_{nullptr}; + std::array<float, 5> results_; + double last_inference_time_ = 0; }; @@ -102,7 +124,6 @@ class PoseEstimator struct Face { std::array<float,4> rotation; // Quaternion, (w, x, y, z) - // The following quantities are defined wrt the image space of the input cv::Rect2f box; cv::Point2f center; float size; @@ -110,7 +131,11 @@ class PoseEstimator PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&session); - // Inference + /** Inference + * + * Coordinates are defined wrt. the image space of the input `frame`. + * X goes right, Z (depth) into the image, Y points down (like pixel coordinates values increase from top to bottom) + */ std::optional<Face> run(const cv::Mat &frame, const cv::Rect &box); // Returns an image compatible with the 'frame' image for displaying. cv::Mat last_network_input() const; @@ -118,96 +143,129 @@ class PoseEstimator private: // Operates on the private image data members int find_input_intensity_90_pct_quantile() const; - inline static constexpr int input_img_width = 129; - inline static constexpr int input_img_height = 129; - Ort::Session session{nullptr}; + + int64_t model_version_ = 0; // Queried meta data from the ONNX file + Ort::Session session_{nullptr}; // ONNX's runtime context for running the model + Ort::Allocator allocator_; // Memory allocator for tensors // Inputs - cv::Mat scaled_frame{}, input_mat{}; - Ort::Value input_val{nullptr}; + cv::Mat scaled_frame_{}, input_mat_{}; // Input. One is the original crop, the other is rescaled (?) + std::vector<Ort::Value> input_val_; // Tensors to put into the model + std::vector<const char*> input_names_; // Refers to the names in the onnx model. // Outputs - cv::Vec<float, 3> output_coord{}; - cv::Vec<float, 4> output_quat{}; - cv::Vec<float, 4> output_box{}; - Ort::Value output_val[3] = { - Ort::Value{nullptr}, - Ort::Value{nullptr}, - Ort::Value{nullptr}}; - double last_inference_time = 0; + cv::Vec<float, 3> output_coord_{}; // 2d Coordinate and head size output. + cv::Vec<float, 4> output_quat_{}; // Quaternion output + cv::Vec<float, 4> output_box_{}; // Bounding box output + std::vector<Ort::Value> output_val_; // Tensors to put the model outputs in. + std::vector<const char*> output_names_; // Refers to the names in the onnx model. 
+ size_t num_recurrent_states_ = 0; + double last_inference_time_ = 0; +}; + + +class Preview +{ +public: + void init(const cv_video_widget& widget); + void copy_video_frame(const cv::Mat& frame); + void draw_gizmos( + const std::optional<PoseEstimator::Face> &face, + const Affine& pose, + const std::optional<cv::Rect2f>& last_roi, + const std::optional<cv::Rect2f>& last_localizer_roi, + const cv::Point2f& neckjoint_position); + void overlay_netinput(const cv::Mat& netinput); + void draw_fps(double fps, double last_inference_time); + void copy_to_widget(cv_video_widget& widget); +private: + // Transform from camera frame to preview + cv::Rect2f transform(const cv::Rect2f& r) const; + cv::Point2f transform(const cv::Point2f& p) const; + float transform(float s) const; + + cv::Mat preview_image_; + cv::Size preview_size_ = { 0, 0 }; + float scale_ = 1.f; + cv::Point2f offset_ = { 0.f, 0.f}; }; -class neuralnet_tracker : protected virtual QThread, public ITracker +class NeuralNetTracker : protected virtual QThread, public ITracker { Q_OBJECT public: - neuralnet_tracker(); - ~neuralnet_tracker() override; + NeuralNetTracker(); + ~NeuralNetTracker() override; module_status start_tracker(QFrame* frame) override; void data(double *data) override; void run() override; Affine pose(); + std::tuple<cv::Size, double, double> stats() const; - QMutex camera_mtx; - std::unique_ptr<video::impl::camera> camera; + QMutex camera_mtx_; + std::unique_ptr<video::impl::camera> camera_; private: bool detect(); bool open_camera(); void set_intrinsics(); + cv::Mat prepare_input_image(const video::frame& frame); bool load_and_initialize_model(); void draw_gizmos( - cv::Mat frame, const std::optional<PoseEstimator::Face> &face, - const Affine& pose) const; + const Affine& pose); void update_fps(double dt); - Affine compute_pose(const PoseEstimator::Face &face) const; - numeric_types::vec3 image_to_world(float x, float y, float size, float real_size) const; - numeric_types::vec2 world_to_image(const numeric_types::vec3& p) const; - - settings s; - std::optional<Localizer> localizer; - std::optional<PoseEstimator> poseestimator; - Ort::Env env{nullptr}; - Ort::MemoryInfo allocator_info{nullptr}; - - CamIntrinsics intrinsics{}; - cv::Mat frame, grayscale; - std::optional<cv::Rect2f> last_localizer_roi; - std::optional<cv::Rect2f> last_roi; - static constexpr float head_size_mm = 200.f; - - double fps = 0; - double last_inference_time = 0; + + Settings settings_; + std::optional<Localizer> localizer_; + std::optional<PoseEstimator> poseestimator_; + Ort::Env env_{nullptr}; + Ort::MemoryInfo allocator_info_{nullptr}; + + CamIntrinsics intrinsics_{}; + cv::Mat grayscale_; + std::array<cv::Mat,2> downsized_original_images_ = {}; // Image pyramid + std::optional<cv::Rect2f> last_localizer_roi_; + std::optional<cv::Rect2f> last_roi_; + static constexpr float HEAD_SIZE_MM = 200.f; + + mutable QMutex stats_mtx_; + double fps_ = 0; + double inference_time_ = 0; + cv::Size resolution_ = {}; + static constexpr double RC = .25; + int num_threads_ = 1; + bool is_visible_ = true; - QMutex mtx; // Protects the pose + QMutex mtx_; // Protects the pose Affine pose_; - std::unique_ptr<cv_video_widget> videoWidget; - std::unique_ptr<QHBoxLayout> layout; + Preview preview_; + std::unique_ptr<cv_video_widget> video_widget_; + std::unique_ptr<QHBoxLayout> layout_; }; -class neuralnet_dialog : public ITrackerDialog +class NeuralNetDialog : public ITrackerDialog { Q_OBJECT public: - neuralnet_dialog(); + NeuralNetDialog(); void 
register_tracker(ITracker * x) override; void unregister_tracker() override; private: void make_fps_combobox(); + void make_resolution_combobox(); - Ui::Form ui; - settings s; - + Ui::Form ui_; + Settings settings_; // Calibration code mostly taken from point tracker - QTimer calib_timer; - TranslationCalibrator trans_calib; - QMutex calibrator_mutex; - - neuralnet_tracker* tracker = nullptr; + QTimer calib_timer_; + TranslationCalibrator trans_calib_; + QMutex calibrator_mutex_; + QTimer tracker_status_poll_timer_; + NeuralNetTracker* tracker_ = nullptr; private Q_SLOTS: void doOK(); @@ -216,10 +274,11 @@ private Q_SLOTS: void update_camera_settings_state(const QString& name); void startstop_trans_calib(bool start); void trans_calib_step(); + void status_poll(); }; -class neuralnet_metadata : public Metadata +class NeuralNetMetadata : public Metadata { Q_OBJECT QString name() override { return QString("neuralnet tracker"); } @@ -229,6 +288,6 @@ class neuralnet_metadata : public Metadata } // neuralnet_tracker_ns -using neuralnet_tracker_ns::neuralnet_tracker; -using neuralnet_tracker_ns::neuralnet_dialog; -using neuralnet_tracker_ns::neuralnet_metadata; +using neuralnet_tracker_ns::NeuralNetTracker; +using neuralnet_tracker_ns::NeuralNetDialog; +using neuralnet_tracker_ns::NeuralNetMetadata; diff --git a/tracker-neuralnet/lang/nl_NL.ts b/tracker-neuralnet/lang/nl_NL.ts index a2dcd958..dbcd3c8c 100644 --- a/tracker-neuralnet/lang/nl_NL.ts +++ b/tracker-neuralnet/lang/nl_NL.ts @@ -64,14 +64,70 @@ Don't roll or change position.</source> <source>MJPEG</source> <translation type="unfinished"></translation> </message> + <message> + <source>Tuning / Debug</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>ROI Smoothing Alpha</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>ROI Zoom</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Thread Count</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Resolution</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Field of view. Needed to transform the pose to world coordinates.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Requested video frame rate. Actual setting may not be supported by the camera.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>The requested resolution for cases where the camera delivers maximum frame rate only for a particular resolution. The image may still be downscaled to the internal resolution.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Number of threads. Can be used to balance the CPU load between the game and the tracker.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Show the image patch that the pose estimation model sees.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Amount of smoothing of the face region coordinates. Can help stabilize the pose.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Zoom factor for the face region. Applied before the patch is fed into the pose estimation model. 
There is a sweet spot near 1.</source> + <translation type="unfinished"></translation> + </message> </context> <context> - <name>neuralnet_tracker_ns::neuralnet_dialog</name> + <name>neuralnet_tracker_ns::NeuralNetDialog</name> <message> <source>Default</source> <translation type="unfinished">Standaard</translation> </message> <message> + <source>Tracker Offline</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>%1x%2 @ %3 FPS / Inference: %4 ms</source> + <translation type="unfinished"></translation> + </message> + <message> <source>%1 yaw samples. Yaw more to %2 samples for stable calibration.</source> <translation type="unfinished"></translation> </message> diff --git a/tracker-neuralnet/lang/ru_RU.ts b/tracker-neuralnet/lang/ru_RU.ts index 7e8a9c09..a80c7e3d 100644 --- a/tracker-neuralnet/lang/ru_RU.ts +++ b/tracker-neuralnet/lang/ru_RU.ts @@ -64,14 +64,70 @@ Don't roll or change position.</source> <source>MJPEG</source> <translation type="unfinished"></translation> </message> + <message> + <source>Tuning / Debug</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>ROI Smoothing Alpha</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>ROI Zoom</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Thread Count</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Resolution</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Field of view. Needed to transform the pose to world coordinates.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Requested video frame rate. Actual setting may not be supported by the camera.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>The requested resolution for cases where the camera delivers maximum frame rate only for a particular resolution. The image may still be downscaled to the internal resolution.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Number of threads. Can be used to balance the CPU load between the game and the tracker.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Show the image patch that the pose estimation model sees.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Amount of smoothing of the face region coordinates. Can help stabilize the pose.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Zoom factor for the face region. Applied before the patch is fed into the pose estimation model. There is a sweet spot near 1.</source> + <translation type="unfinished"></translation> + </message> </context> <context> - <name>neuralnet_tracker_ns::neuralnet_dialog</name> + <name>neuralnet_tracker_ns::NeuralNetDialog</name> <message> <source>Default</source> <translation type="unfinished"></translation> </message> <message> + <source>Tracker Offline</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>%1x%2 @ %3 FPS / Inference: %4 ms</source> + <translation type="unfinished"></translation> + </message> + <message> <source>%1 yaw samples. 
Yaw more to %2 samples for stable calibration.</source> <translation type="unfinished"></translation> </message> diff --git a/tracker-neuralnet/lang/stub.ts b/tracker-neuralnet/lang/stub.ts index 66e1695c..4cde86a9 100644 --- a/tracker-neuralnet/lang/stub.ts +++ b/tracker-neuralnet/lang/stub.ts @@ -64,14 +64,70 @@ Don't roll or change position.</source> <source>MJPEG</source> <translation type="unfinished"></translation> </message> + <message> + <source>Tuning / Debug</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>ROI Smoothing Alpha</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>ROI Zoom</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Thread Count</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Resolution</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Field of view. Needed to transform the pose to world coordinates.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Requested video frame rate. Actual setting may not be supported by the camera.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>The requested resolution for cases where the camera delivers maximum frame rate only for a particular resolution. The image may still be downscaled to the internal resolution.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Number of threads. Can be used to balance the CPU load between the game and the tracker.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Show the image patch that the pose estimation model sees.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Amount of smoothing of the face region coordinates. Can help stabilize the pose.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Zoom factor for the face region. Applied before the patch is fed into the pose estimation model. There is a sweet spot near 1.</source> + <translation type="unfinished"></translation> + </message> </context> <context> - <name>neuralnet_tracker_ns::neuralnet_dialog</name> + <name>neuralnet_tracker_ns::NeuralNetDialog</name> <message> <source>Default</source> <translation type="unfinished"></translation> </message> <message> + <source>Tracker Offline</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>%1x%2 @ %3 FPS / Inference: %4 ms</source> + <translation type="unfinished"></translation> + </message> + <message> <source>%1 yaw samples. 
Yaw more to %2 samples for stable calibration.</source> <translation type="unfinished"></translation> </message> diff --git a/tracker-neuralnet/lang/zh_CN.ts b/tracker-neuralnet/lang/zh_CN.ts index 8e9513b3..c3a91211 100644 --- a/tracker-neuralnet/lang/zh_CN.ts +++ b/tracker-neuralnet/lang/zh_CN.ts @@ -64,14 +64,70 @@ Don't roll or change position.</source> <source>MJPEG</source> <translation type="unfinished"></translation> </message> + <message> + <source>Tuning / Debug</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>ROI Smoothing Alpha</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>ROI Zoom</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Thread Count</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Resolution</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Field of view. Needed to transform the pose to world coordinates.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Requested video frame rate. Actual setting may not be supported by the camera.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>The requested resolution for cases where the camera delivers maximum frame rate only for a particular resolution. The image may still be downscaled to the internal resolution.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Number of threads. Can be used to balance the CPU load between the game and the tracker.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Show the image patch that the pose estimation model sees.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Amount of smoothing of the face region coordinates. Can help stabilize the pose.</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>Zoom factor for the face region. Applied before the patch is fed into the pose estimation model. There is a sweet spot near 1.</source> + <translation type="unfinished"></translation> + </message> </context> <context> - <name>neuralnet_tracker_ns::neuralnet_dialog</name> + <name>neuralnet_tracker_ns::NeuralNetDialog</name> <message> <source>Default</source> <translation type="unfinished"></translation> </message> <message> + <source>Tracker Offline</source> + <translation type="unfinished"></translation> + </message> + <message> + <source>%1x%2 @ %3 FPS / Inference: %4 ms</source> + <translation type="unfinished"></translation> + </message> + <message> <source>%1 yaw samples. 
Yaw more to %2 samples for stable calibration.</source> <translation type="unfinished"></translation> </message> diff --git a/tracker-neuralnet/neuralnet-trackercontrols.ui b/tracker-neuralnet/neuralnet-trackercontrols.ui index 5f72a809..750e6ef3 100644 --- a/tracker-neuralnet/neuralnet-trackercontrols.ui +++ b/tracker-neuralnet/neuralnet-trackercontrols.ui @@ -9,353 +9,602 @@ <rect> <x>0</x> <y>0</y> - <width>727</width> - <height>202</height> + <width>671</width> + <height>357</height> </rect> </property> <property name="windowTitle"> <string>Tracker settings</string> </property> <layout class="QGridLayout" name="gridLayout"> - <item row="5" column="0"> - <widget class="QDialogButtonBox" name="buttonBox"> - <property name="standardButtons"> - <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set> + <item row="2" column="0"> + <widget class="QGroupBox" name="groupBox"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> </property> - </widget> - </item> - <item row="3" column="0"> - <widget class="QFrame" name="frame_3"> - <property name="frameShape"> - <enum>QFrame::StyledPanel</enum> + <property name="autoFillBackground"> + <bool>true</bool> </property> - <property name="frameShadow"> - <enum>QFrame::Raised</enum> + <property name="title"> + <string>Camera Configuration</string> + </property> + <property name="flat"> + <bool>false</bool> + </property> + <property name="checkable"> + <bool>false</bool> </property> <layout class="QHBoxLayout" name="horizontalLayout"> <property name="spacing"> - <number>0</number> - </property> - <property name="leftMargin"> - <number>0</number> - </property> - <property name="topMargin"> - <number>0</number> - </property> - <property name="rightMargin"> - <number>0</number> + <number>10</number> </property> <property name="bottomMargin"> - <number>0</number> + <number>8</number> </property> <item> - <widget class="QGroupBox" name="groupBox"> + <layout class="QGridLayout" name="gridLayout_3"> + <property name="sizeConstraint"> + <enum>QLayout::SetDefaultConstraint</enum> + </property> + <property name="leftMargin"> + <number>0</number> + </property> + <property name="topMargin"> + <number>0</number> + </property> + <property name="rightMargin"> + <number>0</number> + </property> + <property name="bottomMargin"> + <number>0</number> + </property> + <property name="horizontalSpacing"> + <number>0</number> + </property> + <property name="verticalSpacing"> + <number>2</number> + </property> + <item row="0" column="1"> + <widget class="QComboBox" name="cameraName"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + </widget> + </item> + <item row="1" column="0"> + <widget class="QLabel" name="label_9"> + <property name="text"> + <string>Diagonal FOV</string> + </property> + </widget> + </item> + <item row="0" column="0"> + <widget class="QLabel" name="label_10"> + <property name="text"> + <string>Camera name</string> + </property> + </widget> + </item> + <item row="1" column="1"> + <widget class="QSpinBox" name="cameraFOV"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="toolTip"> + <string>Field of view. 
Needed to transform the pose to world coordinates.</string> + </property> + <property name="locale"> + <locale language="English" country="UnitedStates"/> + </property> + <property name="minimum"> + <number>35</number> + </property> + <property name="maximum"> + <number>90</number> + </property> + </widget> + </item> + <item row="2" column="1"> + <widget class="QComboBox" name="resolution"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="toolTip"> + <string>The requested resolution for cases where the camera delivers maximum frame rate only for a particular resolution. The image may still be downscaled to the internal resolution.</string> + </property> + </widget> + </item> + <item row="2" column="0"> + <widget class="QLabel" name="resolution_label"> + <property name="text"> + <string>Resolution</string> + </property> + </widget> + </item> + </layout> + </item> + <item> + <layout class="QGridLayout" name="gridLayout_6"> + <property name="leftMargin"> + <number>0</number> + </property> + <property name="topMargin"> + <number>0</number> + </property> + <property name="rightMargin"> + <number>0</number> + </property> + <property name="bottomMargin"> + <number>0</number> + </property> + <property name="horizontalSpacing"> + <number>0</number> + </property> + <property name="verticalSpacing"> + <number>2</number> + </property> + <item row="4" column="1"> + <widget class="QComboBox" name="cameraFPS"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="toolTip"> + <string>Requested video frame rate. 
Actual setting may not be supported by the camera.</string> + </property> + </widget> + </item> + <item row="4" column="0"> + <widget class="QLabel" name="label_12"> + <property name="text"> + <string>Frames per second</string> + </property> + </widget> + </item> + <item row="2" column="1"> + <widget class="QCheckBox" name="use_mjpeg"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Minimum" vsizetype="Preferred"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="minimumSize"> + <size> + <width>0</width> + <height>0</height> + </size> + </property> + <property name="text"> + <string/> + </property> + </widget> + </item> + <item row="2" column="0"> + <widget class="QLabel" name="label_11"> + <property name="text"> + <string>MJPEG</string> + </property> + </widget> + </item> + <item row="0" column="1"> + <widget class="QPushButton" name="camera_settings"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="text"> + <string>Camera settings</string> + </property> + </widget> + </item> + </layout> + </item> + </layout> + </widget> + </item> + <item row="9" column="0"> + <widget class="QDialogButtonBox" name="buttonBox"> + <property name="standardButtons"> + <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set> + </property> + </widget> + </item> + <item row="5" column="0"> + <widget class="QGroupBox" name="tuningOptionsBox"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="minimumSize"> + <size> + <width>0</width> + <height>0</height> + </size> + </property> + <property name="autoFillBackground"> + <bool>true</bool> + </property> + <property name="title"> + <string>Tuning / Debug</string> + </property> + <layout class="QGridLayout" name="gridLayout_2"> + <item row="0" column="10"> + <widget class="Line" name="line_2"> + <property name="orientation"> + <enum>Qt::Vertical</enum> + </property> + </widget> + </item> + <item row="0" column="1"> + <widget class="QSpinBox" name="threadCount"> + <property name="toolTip"> + <string>Number of threads. 
Can be used to balance the CPU load between the game and the tracker.</string> + </property> + <property name="minimum"> + <number>1</number> + </property> + <property name="maximum"> + <number>32</number> + </property> + </widget> + </item> + <item row="0" column="4"> + <widget class="Line" name="line"> + <property name="orientation"> + <enum>Qt::Vertical</enum> + </property> + </widget> + </item> + <item row="0" column="8"> + <widget class="QLabel" name="roiFilterAlphaLabel"> <property name="sizePolicy"> - <sizepolicy hsizetype="Fixed" vsizetype="Preferred"> + <sizepolicy hsizetype="Minimum" vsizetype="Minimum"> <horstretch>0</horstretch> <verstretch>0</verstretch> </sizepolicy> </property> - <property name="title"> - <string>Camera Configuration</string> + <property name="text"> + <string>ROI Smoothing Alpha</string> </property> - <layout class="QGridLayout" name="gridLayout_4"> - <item row="3" column="1"> - <widget class="QComboBox" name="cameraName"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - </widget> - </item> - <item row="0" column="1"> - <widget class="QSpinBox" name="cameraFOV"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> + </widget> + </item> + <item row="0" column="11"> + <widget class="QLabel" name="roiZoomLabel"> + <property name="text"> + <string>ROI Zoom</string> + </property> + </widget> + </item> + <item row="0" column="2"> + <widget class="Line" name="line_3"> + <property name="orientation"> + <enum>Qt::Vertical</enum> + </property> + </widget> + </item> + <item row="0" column="3"> + <widget class="QCheckBox" name="showNetworkInput"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Minimum" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="toolTip"> + <string>Show the image patch that the pose estimation model sees.</string> + </property> + <property name="text"> + <string>Show Network Input</string> + </property> + </widget> + </item> + <item row="0" column="9"> + <widget class="QDoubleSpinBox" name="roiFilterAlpha"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Minimum" vsizetype="Fixed"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="maximumSize"> + <size> + <width>150</width> + <height>16777215</height> + </size> + </property> + <property name="toolTip"> + <string>Amount of smoothing of the face region coordinates. Can help stabilize the pose.</string> + </property> + <property name="wrapping"> + <bool>false</bool> + </property> + <property name="decimals"> + <number>2</number> + </property> + <property name="maximum"> + <double>1.000000000000000</double> + </property> + <property name="singleStep"> + <double>0.010000000000000</double> + </property> + <property name="value"> + <double>1.000000000000000</double> + </property> + </widget> + </item> + <item row="0" column="0"> + <widget class="QLabel" name="threadCountLabel"> + <property name="text"> + <string>Thread Count</string> + </property> + </widget> + </item> + <item row="0" column="12"> + <widget class="QDoubleSpinBox" name="roiZoom"> + <property name="toolTip"> + <string>Zoom factor for the face region. Applied before the patch is fed into the pose estimation model. 
There is a sweet spot near 1.</string> + </property> + <property name="minimum"> + <double>0.100000000000000</double> + </property> + <property name="maximum"> + <double>2.000000000000000</double> + </property> + <property name="singleStep"> + <double>0.010000000000000</double> + </property> + <property name="value"> + <double>1.000000000000000</double> + </property> + </widget> + </item> + <item row="0" column="13"> + <spacer name="horizontalSpacer"> + <property name="orientation"> + <enum>Qt::Horizontal</enum> + </property> + <property name="sizeHint" stdset="0"> + <size> + <width>40</width> + <height>20</height> + </size> + </property> + </spacer> + </item> + </layout> + </widget> + </item> + <item row="4" column="0"> + <widget class="QGroupBox" name="groupBox_10"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="autoFillBackground"> + <bool>true</bool> + </property> + <property name="title"> + <string>Head Center Offset</string> + </property> + <layout class="QGridLayout" name="gridLayout_5"> + <item row="0" column="0"> + <widget class="QFrame" name="frame_4"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> + </property> + <property name="maximumSize"> + <size> + <width>16777215</width> + <height>16777215</height> + </size> + </property> + <property name="frameShape"> + <enum>QFrame::NoFrame</enum> + </property> + <property name="frameShadow"> + <enum>QFrame::Raised</enum> + </property> + <layout class="QGridLayout" name="gridLayout_11"> + <property name="sizeConstraint"> + <enum>QLayout::SetDefaultConstraint</enum> + </property> + <property name="verticalSpacing"> + <number>0</number> + </property> + <item row="1" column="1"> + <widget class="QSpinBox" name="ty_spin"> + <property name="maximumSize"> + <size> + <width>150</width> + <height>16777215</height> + </size> </property> - <property name="locale"> - <locale language="English" country="UnitedStates"/> + <property name="suffix"> + <string> mm</string> </property> <property name="minimum"> - <number>35</number> + <number>-65535</number> </property> <property name="maximum"> - <number>90</number> - </property> - </widget> - </item> - <item row="1" column="0"> - <widget class="QLabel" name="label_12"> - <property name="text"> - <string>Frames per second</string> + <number>65536</number> </property> </widget> </item> <item row="2" column="0"> - <widget class="QLabel" name="label_11"> - <property name="text"> - <string>MJPEG</string> + <widget class="QLabel" name="label_66"> + <property name="sizePolicy"> + <sizepolicy hsizetype="Maximum" vsizetype="Preferred"> + <horstretch>0</horstretch> + <verstretch>0</verstretch> + </sizepolicy> </property> - </widget> - </item> - <item row="3" column="0"> - <widget class="QLabel" name="label_10"> <property name="text"> - <string>Camera name</string> + <string>Right</string> </property> </widget> </item> - <item row="0" column="0"> - <widget class="QLabel" name="label_9"> - <property name="text"> - <string>Diagonal FOV</string> + <item row="2" column="1"> + <widget class="QSpinBox" name="tz_spin"> + <property name="maximumSize"> + <size> + <width>150</width> + <height>16777215</height> + </size> + </property> + <property name="suffix"> + <string> mm</string> + </property> + <property name="minimum"> + <number>-65535</number> + </property> 
+ <property name="maximum"> + <number>65536</number> </property> </widget> </item> - <item row="1" column="1"> - <widget class="QComboBox" name="cameraFPS"> + <item row="0" column="0"> + <widget class="QLabel" name="label_61"> <property name="sizePolicy"> - <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> + <sizepolicy hsizetype="Maximum" vsizetype="Preferred"> <horstretch>0</horstretch> <verstretch>0</verstretch> </sizepolicy> </property> + <property name="text"> + <string>Forward</string> + </property> </widget> </item> - <item row="2" column="1"> - <widget class="QCheckBox" name="use_mjpeg"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Minimum" vsizetype="Maximum"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> + <item row="0" column="1"> + <widget class="QSpinBox" name="tx_spin"> + <property name="maximumSize"> + <size> + <width>150</width> + <height>16777215</height> + </size> </property> - <property name="text"> - <string/> + <property name="suffix"> + <string> mm</string> + </property> + <property name="minimum"> + <number>-65535</number> + </property> + <property name="maximum"> + <number>65536</number> </property> </widget> </item> - <item row="4" column="1"> - <widget class="QPushButton" name="camera_settings"> + <item row="1" column="0"> + <widget class="QLabel" name="label_62"> <property name="sizePolicy"> - <sizepolicy hsizetype="Preferred" vsizetype="Maximum"> + <sizepolicy hsizetype="Maximum" vsizetype="Preferred"> <horstretch>0</horstretch> <verstretch>0</verstretch> </sizepolicy> </property> <property name="text"> - <string>Camera settings</string> + <string>Up</string> </property> </widget> </item> </layout> </widget> </item> - <item> - <widget class="QGroupBox" name="groupBox_10"> + <item row="0" column="1"> + <widget class="QFrame" name="frame_5"> <property name="sizePolicy"> - <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> + <sizepolicy hsizetype="Preferred" vsizetype="Expanding"> <horstretch>0</horstretch> <verstretch>0</verstretch> </sizepolicy> </property> - <property name="title"> - <string>Head Center Offset</string> + <property name="minimumSize"> + <size> + <width>260</width> + <height>0</height> + </size> </property> - <layout class="QGridLayout" name="gridLayout_5"> - <item row="0" column="0"> - <widget class="QFrame" name="frame_4"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Preferred" vsizetype="Preferred"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> + <property name="frameShape"> + <enum>QFrame::NoFrame</enum> + </property> + <property name="frameShadow"> + <enum>QFrame::Raised</enum> + </property> + <layout class="QVBoxLayout" name="verticalLayout_2"> + <item> + <widget class="QLabel" name="label_59"> + <property name="text"> + <string>Use only yaw and pitch while calibrating. 
+Don't roll or change position.</string> </property> - <property name="maximumSize"> - <size> - <width>16777215</width> - <height>16777215</height> - </size> + <property name="alignment"> + <set>Qt::AlignCenter</set> </property> - <property name="frameShape"> - <enum>QFrame::NoFrame</enum> + <property name="wordWrap"> + <bool>true</bool> </property> - <property name="frameShadow"> - <enum>QFrame::Raised</enum> + <property name="openExternalLinks"> + <bool>false</bool> </property> - <layout class="QGridLayout" name="gridLayout_11"> - <item row="1" column="1"> - <widget class="QSpinBox" name="ty_spin"> - <property name="maximumSize"> - <size> - <width>150</width> - <height>16777215</height> - </size> - </property> - <property name="suffix"> - <string> mm</string> - </property> - <property name="minimum"> - <number>-65535</number> - </property> - <property name="maximum"> - <number>65536</number> - </property> - </widget> - </item> - <item row="2" column="0"> - <widget class="QLabel" name="label_66"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Maximum" vsizetype="Preferred"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="text"> - <string>Right</string> - </property> - </widget> - </item> - <item row="2" column="1"> - <widget class="QSpinBox" name="tz_spin"> - <property name="maximumSize"> - <size> - <width>150</width> - <height>16777215</height> - </size> - </property> - <property name="suffix"> - <string> mm</string> - </property> - <property name="minimum"> - <number>-65535</number> - </property> - <property name="maximum"> - <number>65536</number> - </property> - </widget> - </item> - <item row="0" column="0"> - <widget class="QLabel" name="label_61"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Maximum" vsizetype="Preferred"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="text"> - <string>Forward</string> - </property> - </widget> - </item> - <item row="0" column="1"> - <widget class="QSpinBox" name="tx_spin"> - <property name="maximumSize"> - <size> - <width>150</width> - <height>16777215</height> - </size> - </property> - <property name="suffix"> - <string> mm</string> - </property> - <property name="minimum"> - <number>-65535</number> - </property> - <property name="maximum"> - <number>65536</number> - </property> - </widget> - </item> - <item row="1" column="0"> - <widget class="QLabel" name="label_62"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Maximum" vsizetype="Preferred"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="text"> - <string>Up</string> - </property> - </widget> - </item> - </layout> </widget> </item> - <item row="0" column="1"> - <widget class="QFrame" name="frame_5"> + <item> + <widget class="QLabel" name="sample_count_display"> <property name="sizePolicy"> - <sizepolicy hsizetype="Preferred" vsizetype="Expanding"> + <sizepolicy hsizetype="Minimum" vsizetype="Maximum"> <horstretch>0</horstretch> <verstretch>0</verstretch> </sizepolicy> </property> - <property name="minimumSize"> - <size> - <width>260</width> - <height>0</height> - </size> - </property> <property name="frameShape"> - <enum>QFrame::NoFrame</enum> + <enum>QFrame::Panel</enum> </property> <property name="frameShadow"> - <enum>QFrame::Raised</enum> + <enum>QFrame::Sunken</enum> + </property> + <property name="text"> + <string/> + </property> + <property name="wordWrap"> + <bool>true</bool> + 
</property> + </widget> + </item> + <item> + <widget class="QPushButton" name="tcalib_button"> + <property name="enabled"> + <bool>false</bool> + </property> + <property name="text"> + <string>Start calibration</string> + </property> + <property name="checkable"> + <bool>true</bool> </property> - <layout class="QVBoxLayout" name="verticalLayout_2"> - <item> - <widget class="QLabel" name="label_59"> - <property name="text"> - <string>Use only yaw and pitch while calibrating. -Don't roll or change position.</string> - </property> - <property name="alignment"> - <set>Qt::AlignCenter</set> - </property> - <property name="wordWrap"> - <bool>true</bool> - </property> - <property name="openExternalLinks"> - <bool>false</bool> - </property> - </widget> - </item> - <item> - <widget class="QLabel" name="sample_count_display"> - <property name="sizePolicy"> - <sizepolicy hsizetype="Minimum" vsizetype="Maximum"> - <horstretch>0</horstretch> - <verstretch>0</verstretch> - </sizepolicy> - </property> - <property name="text"> - <string/> - </property> - <property name="wordWrap"> - <bool>true</bool> - </property> - </widget> - </item> - <item> - <widget class="QPushButton" name="tcalib_button"> - <property name="enabled"> - <bool>false</bool> - </property> - <property name="text"> - <string>Start calibration</string> - </property> - <property name="checkable"> - <bool>true</bool> - </property> - </widget> - </item> - </layout> </widget> </item> </layout> @@ -364,10 +613,19 @@ Don't roll or change position.</string> </layout> </widget> </item> - <item row="4" column="0"> - <widget class="QCheckBox" name="showNetworkInput"> + <item row="8" column="0"> + <widget class="QLabel" name="resolution_display"> + <property name="autoFillBackground"> + <bool>true</bool> + </property> + <property name="frameShape"> + <enum>QFrame::Panel</enum> + </property> + <property name="frameShadow"> + <enum>QFrame::Sunken</enum> + </property> <property name="text"> - <string>Show Network Input</string> + <string notr="true"/> </property> </widget> </item> |
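
The dialog-side additions in this patch (tracker_status_poll_timer_, the status_poll() slot, NeuralNetTracker::stats() returning resolution, FPS and inference time, plus the new translatable strings "Tracker Offline" and "%1x%2 @ %3 FPS / Inference: %4 ms") point to a periodic status readout in the settings dialog. As a minimal sketch of the formatting only, not code from the patch: the helper name and the choice of one decimal place are assumptions for illustration.

    // Sketch: turn the stats() values into the status string added above.
    // Only the two source strings are taken from the patch; everything else is assumed.
    #include <QString>

    QString format_tracker_status(bool online, int w, int h, double fps, double inference_ms)
    {
        if (!online)
            return QStringLiteral("Tracker Offline");
        // Same placeholders as the "%1x%2 @ %3 FPS / Inference: %4 ms" source string.
        return QStringLiteral("%1x%2 @ %3 FPS / Inference: %4 ms")
            .arg(w).arg(h)
            .arg(fps, 0, 'f', 1)
            .arg(inference_ms, 0, 'f', 1);
    }

In the dialog itself this would presumably run from tracker_status_poll_timer_ via status_poll(), guarded by whether tracker_ is currently set, with stats() protected by the tracker's stats_mtx_.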
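
The new Thread Count spinbox (threadCount, range 1–32, tooltip "Number of threads. Can be used to balance the CPU load between the game and the tracker.") and the num_threads_ member suggest the value is forwarded to ONNX Runtime when the model sessions are built. A hedged sketch of the usual way to do that with the public C++ API follows; the exact option calls and the model path are assumptions, not taken from this diff.

    // Sketch: apply a user-chosen thread count to an ONNX Runtime session.
    #include <onnxruntime_cxx_api.h>

    Ort::Session make_session(Ort::Env& env, const ORTCHAR_T* model_path, int num_threads)
    {
        Ort::SessionOptions opts;
        opts.SetIntraOpNumThreads(num_threads); // CPU threads used within a single inference
        opts.SetInterOpNumThreads(1);           // keep cross-operator scheduling overhead low
        return Ort::Session(env, model_path, opts);
    }

Keeping the count user-configurable lets the tracker yield CPU time to the game when needed, which is what the tooltip describes.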