author | Michael Welter <michael@welter-4d.de> | 2022-09-11 20:55:26 +0200
committer | Stanislaw Halik <sthalik@misaki.pl> | 2022-11-01 13:51:35 +0100
commit | 08f1fcad1c74e25f97641a0ccbd229b267ec528c (patch)
tree | 000b1b276bc7df4a74fd493dab05bcce68801de8 /tracker-neuralnet/model_adapters.cpp
parent | 77d6abaf53dbe2ee6334bd59b112e25d694a2f65 (diff)
tracker/nn: Tweaks, refactoring, deadzone filtering, and support for uncertainty estimation
* Add a rudimentary test for two functions .. maybe more in the future
* Fix the rotation correction from vertical translation
* Move preview class to new files
* Move neural network model adapters to new files
* Add utility functions for opencv
* Query the model inputs/outputs by name to see what is available
* Support outputs for the standard deviation of the data distribution -
  this is what you get if you let the model output the full parameters of a
  Gaussian distribution (conditioned on the inputs) and fit it with a
  negative log-likelihood loss (see the first sketch after this list)
* Disable support for sequence models
* Add support for eye open/closed classification. Scale the uncertainty
  estimate up if the eyes are closed (see the second sketch after this list)
* Add a deadzone filter which activates if the model supports uncertainty
  quantification. The deadzone becomes larger the more uncertain the
  model/data are. This is mostly meant to suppress large estimation
  errors when the user blinks (see the third sketch after this list)
* Fix distance being twice what it should have been
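
For context on the uncertainty outputs: fitting a model head that emits both a point estimate and a standard deviation amounts to minimizing the Gaussian negative log-likelihood of the targets. A minimal sketch for a single scalar target, with a hypothetical function name (the training code is not part of this commit):

    #include <cmath>

    // Gaussian negative log-likelihood for one scalar prediction.
    // mean and stddev are the network outputs; minimizing this jointly
    // fits the point estimate and a calibrated uncertainty. The constant
    // 0.5*log(2*pi) is dropped since it does not affect the optimum.
    float gaussian_nll(float mean, float stddev, float target)
    {
        const float z = (target - mean) / stddev;
        return std::log(stddev) + 0.5f*z*z;
    }

A model fit this way tends to report a large stddev exactly where its point estimate is unreliable, e.g. during a blink, which is what the filtering described above keys on.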
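The eye-closure scale-up appears in PoseEstimator::run in the diff below; extracted into a standalone helper (the function name is made up here, the formula is the commit's own) it reads:

    #include <algorithm>
    #include <cmath>

    // Same formula as in PoseEstimator::run: close to 1 while both eyes
    // are open, rising steeply to 11 as the less-open eye approaches 0.
    float eye_closure_uncertainty_factor(float left_open, float right_open)
    {
        const float eye_open = std::min(left_open, right_open);
        return 1.f + 10.f * std::pow(1.f - eye_open, 4.f);
    }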
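The deadzone filter itself lives in other files of this commit, not in model_adapters.cpp. As a rough sketch of the idea under stated assumptions (the names and the width factor below are illustrative, not the commit's actual code): the published value only moves once the raw estimate leaves a zone whose radius grows with the model's reported standard deviation.

    #include <cmath>

    // Illustrative only. prev is the last published value, estimate the
    // new raw model output, stddev its reported uncertainty.
    float deadzone_filter(float prev, float estimate, float stddev)
    {
        const float radius = 3.f * stddev;   // zone widens with uncertainty
        const float delta = estimate - prev;
        if (std::abs(delta) <= radius)
            return prev;                     // uncertain wiggle: hold still
        // Follow the estimate, offset so the output is continuous at the edge.
        return estimate - std::copysign(radius, delta);
    }

With closed eyes the inflated stddev widens the zone, so blink-induced jumps in the raw estimate are absorbed instead of reaching the output.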
Diffstat (limited to 'tracker-neuralnet/model_adapters.cpp')
-rw-r--r-- | tracker-neuralnet/model_adapters.cpp | 433
1 file changed, 433 insertions, 0 deletions
diff --git a/tracker-neuralnet/model_adapters.cpp b/tracker-neuralnet/model_adapters.cpp
new file mode 100644
index 00000000..af599321
--- /dev/null
+++ b/tracker-neuralnet/model_adapters.cpp
@@ -0,0 +1,433 @@
+#include "model_adapters.h"
+
+#include "compat/timer.hpp"
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/quaternion.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <QDebug>
+
+
+namespace neuralnet_tracker_ns
+{
+
+
+float sigmoid(float x)
+{
+    return 1.f/(1.f + std::exp(-x));
+}
+
+
+// Defined in ftnoir_tracker_neuralnet.cpp
+// Normally we wouldn't need it here. However ... see below.
+cv::Quatf image_to_world(cv::Quatf q);
+
+
+cv::Quatf world_to_image(cv::Quatf q)
+{
+    // It's its own inverse.
+    return image_to_world(q);
+}
+
+
+cv::Rect2f unnormalize(const cv::Rect2f &r, int h, int w)
+{
+    auto unnorm = [](float x) -> float { return 0.5*(x+1); };
+    auto tl = r.tl();
+    auto br = r.br();
+    auto x0 = unnorm(tl.x)*w;
+    auto y0 = unnorm(tl.y)*h;
+    auto x1 = unnorm(br.x)*w;
+    auto y1 = unnorm(br.y)*h;
+    return {
+        x0, y0, x1-x0, y1-y0
+    };
+}
+
+
+// Returns width and height of the input tensor, or throws.
+// Expects the model to take one tensor as input that must
+// have the shape B x C x H x W, where B=C=1.
+cv::Size get_input_image_shape(const Ort::Session &session)
+{
+    if (session.GetInputCount() < 1)
+        throw std::invalid_argument("Model must take at least one input tensor");
+    const std::vector<std::int64_t> shape =
+        session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
+    if (shape.size() != 4)
+        throw std::invalid_argument("Model takes the input tensor in the wrong shape");
+    return { static_cast<int>(shape[3]), static_cast<int>(shape[2]) };
+}
+
+
+Ort::Value create_tensor(const Ort::TypeInfo& info, Ort::Allocator& alloc)
+{
+    const auto shape = info.GetTensorTypeAndShapeInfo().GetShape();
+    auto t = Ort::Value::CreateTensor<float>(
+        alloc, shape.data(), shape.size());
+    memset(t.GetTensorMutableData<float>(), 0, sizeof(float)*info.GetTensorTypeAndShapeInfo().GetElementCount());
+    return t;
+}
+
+
+int find_input_intensity_quantile(const cv::Mat& frame, float percentage)
+{
+    const int channels[] = { 0 };
+    const int hist_size[] = { 256 };
+    float range[] = { 0, 256 };
+    const float* ranges[] = { range };
+    cv::Mat hist;
+    cv::calcHist(&frame, 1, channels, cv::Mat(), hist, 1, hist_size, ranges, true, false);
+    int gray_level = 0;
+    const int num_pixels_quantile = frame.total()*percentage*0.01f;
+    int num_pixels_accum = 0;
+    for (int i=0; i<hist_size[0]; ++i)
+    {
+        num_pixels_accum += hist.at<float>(i);
+        if (num_pixels_accum > num_pixels_quantile)
+        {
+            gray_level = i;
+            break;
+        }
+    }
+    return gray_level;
+}
+
+
+// Automatic brightness adjustment. Scales brightness to lie between -.5 and 0.5, roughly.
+void normalize_brightness(const cv::Mat& frame, cv::Mat& out)
+{
+    const float pct = 90;
+
+    const int brightness = find_input_intensity_quantile(frame, pct);
+
+    const double alpha = brightness<127 ? (pct/100.f*0.5f/std::max(5,brightness)) : 1./255;
+    const double beta = -0.5;
+
+    frame.convertTo(out, CV_32F, alpha, beta);
+}
+
+
+
+Localizer::Localizer(Ort::MemoryInfo &allocator_info, Ort::Session &&session) :
+    session_{std::move(session)},
+    scaled_frame_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_8U),
+    input_mat_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_32F)
+{
+    // Only works as long as input_mat_ does not reallocate its memory ... which it should not.
+    // Non-owning memory reference to input_mat?
+    // Note: shape = (batch x channels x h x w)
+    const std::int64_t input_shape[4] = { 1, 1, INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH };
+    input_val_ = Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4);
+
+    const std::int64_t output_shape[2] = { 1, 5 };
+    output_val_ = Ort::Value::CreateTensor<float>(allocator_info, results_.data(), results_.size(), output_shape, 2);
+}
+
+
+std::pair<float, cv::Rect2f> Localizer::run(
+    const cv::Mat &frame)
+{
+    auto p = input_mat_.ptr(0);
+
+    cv::resize(frame, scaled_frame_, { INPUT_IMG_WIDTH, INPUT_IMG_HEIGHT }, 0, 0, cv::INTER_AREA);
+    scaled_frame_.convertTo(input_mat_, CV_32F, 1./255., -0.5);
+
+    assert (input_mat_.ptr(0) == p);
+    assert (!input_mat_.empty() && input_mat_.isContinuous());
+    assert (input_mat_.cols == INPUT_IMG_WIDTH && input_mat_.rows == INPUT_IMG_HEIGHT);
+
+    const char* input_names[] = {"x"};
+    const char* output_names[] = {"logit_box"};
+
+    Timer t; t.start();
+
+    session_.Run(Ort::RunOptions{nullptr}, input_names, &input_val_, 1, output_names, &output_val_, 1);
+
+    last_inference_time_ = t.elapsed_ms();
+
+    const cv::Rect2f roi = unnormalize(cv::Rect2f{
+        results_[1],
+        results_[2],
+        results_[3]-results_[1], // Width
+        results_[4]-results_[2]  // Height
+    }, frame.rows, frame.cols);
+    const float score = sigmoid(results_[0]);
+
+    return { score, roi };
+}
+
+
+double Localizer::last_inference_time_millis() const
+{
+    return last_inference_time_;
+}
+
+
+PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&session)
+    : model_version_{session.GetModelMetadata().GetVersion()}
+    , session_{std::move(session)}
+    , allocator_{session_, allocator_info}
+{
+    using namespace std::literals::string_literals;
+
+    if (session_.GetOutputCount() < 2)
+        throw std::runtime_error("Invalid Model: must have at least two outputs");
+
+    // WARNING UB .. but still ...
+    // If the model was saved without meta data, it seems the version field is uninitialized.
+    // In that case reading from it is UB. However, in practice we will just get some arbitrary
+    // number which is hopefully different from the numbers used by models where the version is set.
+    if (model_version_ != 2 && model_version_ != 3)
+        model_version_ = 1;
+
+    const cv::Size input_image_shape = get_input_image_shape(session_);
+
+    scaled_frame_ = cv::Mat(input_image_shape, CV_8U, cv::Scalar(0));
+    input_mat_ = cv::Mat(input_image_shape, CV_32F, cv::Scalar(0.f));
+
+    {
+        const std::int64_t input_shape[4] = { 1, 1, input_image_shape.height, input_image_shape.width };
+        input_val_.push_back(
+            Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4));
+    }
+
+    struct TensorSpec
+    {
+        std::vector<int64_t> shape;
+        float* buffer = nullptr;
+        size_t element_count = 0;
+        bool available = false;
+    };
+
+    std::unordered_map<std::string, TensorSpec> understood_outputs = {
+        { "pos_size", TensorSpec{ { 1, 3 }, &output_coord_[0], output_coord_.rows } },
+        { "quat", TensorSpec{ { 1, 4}, &output_quat_[0], output_quat_.rows } },
+        { "box", TensorSpec{ { 1, 4}, &output_box_[0], output_box_.rows } },
+        { "rotaxis_scales_tril", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }},
+        { "rotaxis_std", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, // TODO: Delete when old models aren't used any more
+        { "eyes", TensorSpec{ { 1, 2}, output_eyes_.val, output_eyes_.rows }},
+        { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}},
+        { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_.val, output_coord_scales_.rows}},
+        //{ "box_std", TensorSpec{ {1, 4}, output_box_scales_.val, output_box_scales_.rows}}
+    };
+
+    qDebug() << "Pose model inputs (" << session_.GetInputCount() << ")";
+    qDebug() << "Pose model outputs (" << session_.GetOutputCount() << "):";
+    for (size_t i=0; i<session_.GetOutputCount(); ++i)
+    {
+        const char* name = session_.GetOutputName(i, allocator_);
+        const auto& output_info = session_.GetOutputTypeInfo(i);
+        const auto& onnx_tensor_spec = output_info.GetTensorTypeAndShapeInfo();
+        auto my_tensor_spec = understood_outputs.find(name);
+
+        qDebug() << "\t" << name << " (" << onnx_tensor_spec.GetShape() << ") dtype: " << onnx_tensor_spec.GetElementType() << " " <<
+            (my_tensor_spec != understood_outputs.end() ? "ok" : "unknown");
+
+        if (my_tensor_spec != understood_outputs.end())
+        {
+            TensorSpec& t = my_tensor_spec->second;
+            if (onnx_tensor_spec.GetShape() != t.shape ||
+                onnx_tensor_spec.GetElementType() != Ort::TypeToTensorType<float>::type)
+                throw std::runtime_error("Invalid output tensor spec for "s + name);
+            output_val_.push_back(Ort::Value::CreateTensor<float>(
+                allocator_info, t.buffer, t.element_count, t.shape.data(), t.shape.size()));
+            t.available = true;
+        }
+        else
+        {
+            // Create tensor regardless and ignore output
+            output_val_.push_back(create_tensor(output_info, allocator_));
+        }
+        output_names_.push_back(name);
+    }
+
+    has_uncertainty_ = understood_outputs.at("rotaxis_scales_tril").available ||
+                       understood_outputs.at("rotaxis_std").available;
+    has_uncertainty_ &= understood_outputs.at("pos_size_std").available ||
+                        understood_outputs.at("pos_size_scales").available;
+    //has_uncertainty_ &= understood_outputs.at("box_std").available;
+    has_eye_closed_detection_ = understood_outputs.at("eyes").available;
+
+    // FIXME: Recurrent states
+
+    // size_t num_regular_outputs = 2;
+
+    // num_recurrent_states_ = session_.GetInputCount()-1;
+    // if (session_.GetOutputCount()-num_regular_outputs != num_recurrent_states_)
+    //     throw std::runtime_error("Invalid Model: After regular inputs and outputs the model must have equal number of inputs and outputs for tensors holding hidden states of recurrent layers.");
+
+    // // Create tensors for recurrent state
+    // for (size_t i = 0; i < num_recurrent_states_; ++i)
+    // {
+    //     const auto& input_info = session_.GetInputTypeInfo(1+i);
+    //     const auto& output_info = session_.GetOutputTypeInfo(num_regular_outputs+i);
+    //     if (input_info.GetTensorTypeAndShapeInfo().GetShape() !=
+    //         output_info.GetTensorTypeAndShapeInfo().GetShape())
+    //         throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have same shape on input and output");
+    //     input_val_.push_back(create_tensor(input_info, allocator_));
+    //     output_val_.push_back(create_tensor(output_info, allocator_));
+    // }
+
+    for (size_t i = 0; i < session_.GetInputCount(); ++i)
+    {
+        input_names_.push_back(session_.GetInputName(i, allocator_));
+    }
+
+    assert (input_names_.size() == input_val_.size());
+    assert (output_names_.size() == output_val_.size());
+}
+
+
+std::optional<PoseEstimator::Face> PoseEstimator::run(
+    const cv::Mat &frame, const cv::Rect &box)
+{
+    cv::Mat cropped;
+
+    const int patch_size = std::max(box.width, box.height)*1.05;
+    const cv::Point2f patch_center = {
+        std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols),
+        std::clamp<float>(box.y + 0.5f*box.height, 0.f, frame.rows)
+    };
+    cv::getRectSubPix(frame, {patch_size, patch_size}, patch_center, cropped);
+
+    // Will get failure if patch_center is outside image boundaries.
+    // Have to catch this case.
+    if (cropped.rows != patch_size || cropped.cols != patch_size)
+        return {};
+
+    [[maybe_unused]] auto* p = input_mat_.ptr(0);
+
+    cv::resize(cropped, scaled_frame_, scaled_frame_.size(), 0, 0, cv::INTER_AREA);
+
+    normalize_brightness(scaled_frame_, input_mat_);
+
+    assert (input_mat_.ptr(0) == p);
+    assert (!input_mat_.empty() && input_mat_.isContinuous());
+
+    Timer t; t.start();
+
+    try
+    {
+        session_.Run(
+            Ort::RunOptions{ nullptr },
+            input_names_.data(),
+            input_val_.data(),
+            input_val_.size(),
+            output_names_.data(),
+            output_val_.data(),
+            output_val_.size());
+    }
+    catch (const Ort::Exception &e)
+    {
+        qDebug() << "Failed to run the model: " << e.what();
+        return {};
+    }
+
+    for (size_t i = 0; i<num_recurrent_states_; ++i)
+    {
+        // Next step, the current output becomes the input.
+        // Thus we realize the recurrent connection.
+        // Only swaps the internal pointers. There is no copy of data.
+        std::swap(
+            output_val_[output_val_.size()-num_recurrent_states_+i],
+            input_val_[input_val_.size()-num_recurrent_states_+i]);
+    }
+
+    // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why?
+    // The instructions are always the same. Maybe a memory allocation
+    // issue. The ONNX api suggests that tensors are allocated in an
+    // arena. Does that matter? Maybe the issue is something else?
+
+    last_inference_time_ = t.elapsed_ms();
+
+    // Perform coordinate transformation.
+    // From patch-local normalized in [-1,1] to
+    // frame unnormalized pixel coordinates.
+
+    const cv::Point2f center = patch_center +
+        (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]};
+
+    cv::Point2f center_stddev = {
+        (0.5f*patch_size)*output_coord_scales_[0],
+        (0.5f*patch_size)*output_coord_scales_[1] };
+
+    const float size = patch_size*0.5f*output_coord_[2];
+
+    float size_stddev = patch_size*0.5f*output_coord_scales_[2];
+
+    // Following Eigen which uses quat components in the order w, x, y, z.
+    // As does OpenCV.
+    cv::Quatf rotation = {
+        output_quat_[3],
+        output_quat_[0],
+        output_quat_[1],
+        output_quat_[2] };
+
+    // Should be lower triangular. If not, maybe something is wrong with the memory layout ... or the model.
+    assert(output_rotaxis_scales_tril_(0, 1) == 0);
+    assert(output_rotaxis_scales_tril_(0, 2) == 0);
+    assert(output_rotaxis_scales_tril_(1, 2) == 0);
+
+    cv::Matx33f rotaxis_scales_tril = output_rotaxis_scales_tril_;
+
+    if (model_version_ < 2)
+    {
+        // Due to a change in coordinate conventions
+        rotation = world_to_image(rotation);
+    }
+
+    const cv::Rect2f outbox = {
+        patch_center.x + (0.5f*patch_size)*output_box_[0],
+        patch_center.y + (0.5f*patch_size)*output_box_[1],
+        0.5f*patch_size*(output_box_[2]-output_box_[0]),
+        0.5f*patch_size*(output_box_[3]-output_box_[1])
+    };
+    // const RoiCorners outbox = {
+    //     patch_center + 0.5f*patch_size*cv::Point2f{output_box_[0], output_box_[1]},
+    //     patch_center + 0.5f*patch_size*cv::Point2f{output_box_[2], output_box_[3]}
+    // };
+    // RoiCorners outbox_stddev = {
+    //     0.5f*patch_size*cv::Point2f{output_box_scales_[0], output_box_scales_[1]},
+    //     0.5f*patch_size*cv::Point2f{output_box_scales_[2], output_box_scales_[3]}
+    // };
+
+    // Because the model is sensitive to closing eyes we increase the uncertainty
+    // a lot to make the subsequent filtering smooth the output more. This should suppress
+    // "twitching" when the user blinks.
+    if (has_eye_closed_detection_)
+    {
+        const float eye_open = std::min(output_eyes_[0], output_eyes_[1]);
+        const float increase_factor = 1.f + 10.f * std::pow(1. - eye_open, 4.f);
+        rotaxis_scales_tril *= increase_factor;
+        size_stddev *= increase_factor;
+        center_stddev *= increase_factor;
+    }
+
+    return std::optional<Face>({
+        rotation, rotaxis_scales_tril, outbox, center, center_stddev, size, size_stddev
+    });
+}
+
+
+cv::Mat PoseEstimator::last_network_input() const
+{
+    assert(!input_mat_.empty());
+    cv::Mat ret;
+    input_mat_.convertTo(ret, CV_8U, 255., 127.);
+    cv::cvtColor(ret, ret, cv::COLOR_GRAY2RGB);
+    return ret;
+}
+
+
+double PoseEstimator::last_inference_time_millis() const
+{
+    return last_inference_time_;
+}
+
+
+} // namespace neuralnet_tracker_ns
\ No newline at end of file