From ce7a1877895ea494ffb79b2e85e48ec6747954f9 Mon Sep 17 00:00:00 2001 From: Michael Welter Date: Thu, 9 May 2024 22:17:37 +0200 Subject: tracker/nn: Add support for 3x3 pos/size cov matrix (and some cleanup) (#1845) Adds support for 3x3 covariance matrices for position+size estimation. My recent (unreleased) models output those instead of only component wise standard deviations. Overall Accept 3x3 covariance matrix output for position+size Old models still worked fine in my testing Delete commented code which was there previously to support recurrent models. (Won't be used in foreseeable future) Delete leftover code for experiments with closed eyes detection (Likewise, I have given up on that / Is not really needed any more) Improve comments and code quality in the internal filter code --- tracker-neuralnet/model_adapters.cpp | 122 ++++++++++++----------------------- 1 file changed, 41 insertions(+), 81 deletions(-) (limited to 'tracker-neuralnet/model_adapters.cpp') diff --git a/tracker-neuralnet/model_adapters.cpp b/tracker-neuralnet/model_adapters.cpp index a8580a89..f53478af 100644 --- a/tracker-neuralnet/model_adapters.cpp +++ b/tracker-neuralnet/model_adapters.cpp @@ -192,11 +192,15 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses if (session_.GetOutputCount() < 2) throw std::runtime_error("Invalid Model: must have at least two outputs"); - // WARNING UB .. but still ... - // If the model was saved without meta data, it seems the version field is uninitialized. - // In that case reading from it is UB. However, in practice we will just get some arbitrary number - // which is hopefully different from the numbers used by models where the version is set. - if (model_version_ != 2 && model_version_ != 3) + // WARNING: Messy model compatibility issues! + // When reading the initial model release, it did not have the version field set. + // Reading it here will result in some unspecified value. It's probably UB due to + // reading uninitialized memory. But there is little choice. + // Now, detection of this old version is messy ... we have to guess based on the + // number we get. Getting an uninitialized value matching a valid version is unlikely. + // But the real problem is that this line must be updated whenever we want to bump the + // version number!! + if (model_version_ <= 0 || model_version_ > 4) model_version_ = 1; const cv::Size input_image_shape = get_input_image_shape(session_); @@ -224,10 +228,9 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses { "box", TensorSpec{ { 1, 4}, &output_box_[0], output_box_.rows } }, { "rotaxis_scales_tril", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, { "rotaxis_std", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, // TODO: Delete when old models aren't used any more - { "eyes", TensorSpec{ { 1, 2}, output_eyes_.val, output_eyes_.rows }}, - { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}}, - { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}}, - //{ "box_std", TensorSpec{ {1, 4}, output_box_scales_.val, output_box_scales_ .rows}} + { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}}, + { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}}, + { "pos_size_scales_tril", TensorSpec{ {1, 3, 3}, output_coord_scales_tril_.val, 9}} }; qDebug() << "Pose model inputs (" << session_.GetInputCount() << ")"; @@ -236,17 +239,17 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses output_c_names_.resize(session_.GetOutputCount()); for (size_t i=0; isecond; + TensorSpec& t = my_tensor_spec_it->second; if (onnx_tensor_spec.GetShape() != t.shape || onnx_tensor_spec.GetElementType() != Ort::TypeToTensorType::type) throw std::runtime_error("Invalid output tensor spec for "s + name); @@ -266,29 +269,9 @@ PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&ses has_uncertainty_ = understood_outputs.at("rotaxis_scales_tril").available || understood_outputs.at("rotaxis_std").available; has_uncertainty_ &= understood_outputs.at("pos_size_std").available || - understood_outputs.at("pos_size_scales").available; - //has_uncertainty_ &= understood_outputs.at("box_std").available; - has_eye_closed_detection_ = understood_outputs.at("eyes").available; - - // FIXME: Recurrent states - - // size_t num_regular_outputs = 2; - - // num_recurrent_states_ = session_.GetInputCount()-1; - // if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_) - // throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers."); - - // // Create tensors for recurrent state - // for (size_t i = 0; i < num_recurrent_states_; ++i) - // { - // const auto& input_info = session_.GetInputTypeInfo(1+i); - // const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i); - // if (input_info.GetTensorTypeAndShapeInfo().GetShape() != - // output_info.GetTensorTypeAndShapeInfo().GetShape()) - // throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on intput and output"); - // input_val_.push_back(create_tensor(input_info, allocator_)); - // output_val_.push_back(create_tensor(output_info, allocator_)); - // } + understood_outputs.at("pos_size_scales").available || + understood_outputs.at("pos_size_scales_tril").available; + pos_scale_uncertainty_is_matrix_ = understood_outputs.at("pos_size_scales_tril").available; input_names_.resize(session_.GetInputCount()); input_c_names_.resize(session_.GetInputCount()); @@ -348,38 +331,32 @@ std::optional PoseEstimator::run( return {}; } - for (size_t i = 0; i PoseEstimator::run( assert(output_rotaxis_scales_tril_(0, 1) == 0); assert(output_rotaxis_scales_tril_(0, 2) == 0); assert(output_rotaxis_scales_tril_(1, 2) == 0); + assert(center_size_cov_tril(0, 1) == 0); + assert(center_size_cov_tril(0, 2) == 0); + assert(center_size_cov_tril(1, 2) == 0); cv::Matx33f rotaxis_scales_tril = output_rotaxis_scales_tril_; @@ -407,29 +387,9 @@ std::optional PoseEstimator::run( 0.5f*patch_size*(output_box_[2]-output_box_[0]), 0.5f*patch_size*(output_box_[3]-output_box_[1]) }; - // const RoiCorners outbox = { - // patch_center + 0.5f*patch_size*cv::Point2f{output_box_[0], output_box_[1]}, - // patch_center + 0.5f*patch_size*cv::Point2f{output_box_[2], output_box_[3]} - // }; - // RoiCorners outbox_stddev = { - // 0.5f*patch_size*cv::Point2f{output_box_scales_[0], output_box_scales_[1]}, - // 0.5f*patch_size*cv::Point2f{output_box_scales_[2], output_box_scales_[3]} - // }; - - // Because the model is sensitive to closing eyes we increase the uncertainty - // a lot to make the subsequent filtering smooth the output more. This should suppress - // "twitching" when the user blinks. - if (has_eye_closed_detection_) - { - const float eye_open = std::min(output_eyes_[0], output_eyes_[1]); - const float increase_factor = 1.f + 10.f * std::pow(1. - eye_open,4.f); - rotaxis_scales_tril *= increase_factor; - size_stddev *= increase_factor; - center_stddev *= increase_factor; - } return std::optional({ - rotation, rotaxis_scales_tril, outbox, center, center_stddev, size, size_stddev + rotation, rotaxis_scales_tril, outbox, center, size, center_size_cov_tril }); } -- cgit v1.2.3