From 08f1fcad1c74e25f97641a0ccbd229b267ec528c Mon Sep 17 00:00:00 2001
From: Michael Welter <michael@welter-4d.de>
Date: Sun, 11 Sep 2022 20:55:26 +0200
Subject: tracker/nn: Tweaks, refactoring, a deadzone filtering and support for
 uncertainty estimation

* Add rudimentary test for two functions .. maybe more in future
* Fix the rotation correction from vertical translation
* Move preview class to new files
* Move neural network model adapters to new files
* Add utility functions for opencv
* Query the model inputs/outputs by name to see what is available
* Supports outputs for standard deviation of the data distribution -
  What you get if you let your model output the full parameters of a
  gaussian distribution (depending on the inputs) and fit it with
  negative log likelihood loss.
* Disabled support for sequence models
* Add support for detection of eye open/close classification.
  Scale uncertainty estimate up if eyes closed
* Add a deadzone filter which activates if the model supports uncertainty
  quantification. The deadzone becomes larger the more uncertain
  the model/data are. This is mostly meant to suppress large
  estimation errors when the user blinks
* Fix distance being twice of what it should have been
---
 tracker-neuralnet/model_adapters.h | 102 +++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 tracker-neuralnet/model_adapters.h

(limited to 'tracker-neuralnet/model_adapters.h')
diff --git a/tracker-neuralnet/model_adapters.h b/tracker-neuralnet/model_adapters.h
new file mode 100644
index 00000000..3fbfb861
--- /dev/null
+++ b/tracker-neuralnet/model_adapters.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <optional>
+#include <array>
+#include <vector>
+
+#include <onnxruntime_cxx_api.h>
+#include <opencv2/core.hpp>
+#include "opencv_contrib.h"
+
+
+namespace neuralnet_tracker_ns
+{
+
+// General-purpose sigmoid helper (declaration only; the definition is not in this header).
+float sigmoid(float x);
+
+
+class Localizer // Owns an ONNX runtime session; run() yields a face score and a bounding box.
+{
+    public:
+        Localizer(Ort::MemoryInfo &allocator_info,
+                    Ort::Session &&session);
+        
+        // Returns the bounding box wrt. the image coordinates of the input `frame`.
+        // The preceding float is the score for being a face, normalized to [0,1].
+        std::pair<float, cv::Rect2f> run(
+            const cv::Mat &frame);
+
+        double last_inference_time_millis() const;
+    private:
+        inline static constexpr int INPUT_IMG_WIDTH = 288; // NOTE(review): presumably the network input width in pixels - confirm in the .cpp
+        inline static constexpr int INPUT_IMG_HEIGHT = 224; // NOTE(review): presumably the network input height in pixels - confirm in the .cpp
+        Ort::Session session_{nullptr};
+        // Input / output buffers and the tensor handles used with the session
+        cv::Mat scaled_frame_{}, input_mat_{};
+        Ort::Value input_val_{nullptr}, output_val_{nullptr};
+        std::array<float, 5> results_; // NOTE(review): presumably score + 4 box values - confirm against run()
+        double last_inference_time_ = 0; // NOTE(review): presumably milliseconds, given the accessor name - confirm
+};
+
+
+class PoseEstimator // Owns an ONNX runtime session; run() yields an optional Face estimate.
+{
+    public:
+        struct Face // Result record returned by run()
+        {
+            cv::Quatf rotation;
+            cv::Matx33f rotaxis_cov_tril; // Lower triangular factor of the Cholesky decomposition
+            cv::Rect2f box;
+            cv::Point2f center;
+            cv::Point2f center_stddev;
+            float size;
+            float size_stddev;
+        };
+
+        PoseEstimator(Ort::MemoryInfo &allocator_info,
+                        Ort::Session &&session);
+        /** Inference
+        *
+        * Coordinates are defined wrt. the image space of the input `frame`.
+        * X goes right, Z (depth) into the image, Y points down (pixel coordinate values increase from top to bottom).
+        */
+        std::optional<Face> run(const cv::Mat &frame, const cv::Rect &box);
+        // Returns an image compatible with the 'frame' image for displaying.
+        cv::Mat last_network_input() const;
+        double last_inference_time_millis() const;
+        bool has_uncertainty() const { return has_uncertainty_; } // Accessor for the has_uncertainty_ flag
+
+    private:
+        int64_t model_version_ = 0;  // Meta data queried from the ONNX file
+        Ort::Session session_{nullptr};  // ONNX runtime context for running the model
+        Ort::Allocator allocator_;   // Memory allocator for tensors
+        // Inputs
+        cv::Mat scaled_frame_{}, input_mat_{};  // Input images. One is the original crop, the other is rescaled (?)
+        std::vector<Ort::Value> input_val_;    // Tensors to put into the model
+        std::vector<const char*> input_names_; // Refers to the input names in the ONNX model.
+        // Outputs
+        cv::Vec<float, 3> output_coord_{};  // 2d coordinate and head size output.
+        cv::Vec<float, 4> output_quat_{};   // Quaternion output
+        cv::Vec<float, 4> output_box_{};    // Bounding box output
+        cv::Matx33f output_rotaxis_scales_tril_{}; // Lower triangular matrix of the LLT factorization of the covariance of the rotation vector, as offset from the output quaternion
+        cv::Vec<float, 2> output_eyes_{}; // NOTE(review): presumably the eye open/closed outputs (cf. has_eye_closed_detection_) - confirm
+        cv::Vec<float, 3> output_coord_scales_{}; // NOTE(review): presumably uncertainty scales matching output_coord_ - confirm
+        std::vector<Ort::Value> output_val_; // Tensors to put the model outputs in.
+        std::vector<const char*> output_names_; // Refers to the output names in the ONNX model.
+        // More bookkeeping
+        size_t num_recurrent_states_ = 0; // NOTE(review): the commit message says sequence models are disabled; presumably kept for them - confirm
+        double last_inference_time_ = 0; // NOTE(review): presumably milliseconds, given the accessor name - confirm
+        bool has_uncertainty_ = false;
+        bool has_eye_closed_detection_ = false;
+};
+
+
+// Finds the intensity below which `percentage` percent of the pixels of `frame` lie. NOTE(review): confirm whether `percentage` is 0..100 or 0..1.
+int find_input_intensity_quantile(const cv::Mat& frame, float percentage);
+
+// Adjusts the brightness levels of `frame` to the full range and scales the value range to [-0.5, 0.5]; takes `out` as the output argument.
+void normalize_brightness(const cv::Mat& frame, cv::Mat& out);
+
+
+} // namespace neuralnet_tracker_ns
\ No newline at end of file
-- 
cgit v1.2.3