Diffstat (limited to 'tracker-neuralnet')
-rw-r--r--  tracker-neuralnet/BUILD.md | 20
-rw-r--r--  tracker-neuralnet/CMakeLists.txt | 49
-rw-r--r--  tracker-neuralnet/deadzone_filter.cpp | 173
-rw-r--r--  tracker-neuralnet/deadzone_filter.h | 37
-rw-r--r--  tracker-neuralnet/ftnoir_tracker_neuralnet.cpp | 976
-rw-r--r--  tracker-neuralnet/ftnoir_tracker_neuralnet.h | 230
-rw-r--r--  tracker-neuralnet/images/neuralnet.png | bin 0 -> 595 bytes
-rw-r--r--  tracker-neuralnet/lang/de_DE.ts | 172
-rw-r--r--  tracker-neuralnet/lang/nl_NL.ts | 171
-rw-r--r--  tracker-neuralnet/lang/ru_RU.ts | 174
-rw-r--r--  tracker-neuralnet/lang/stub.ts | 171
-rw-r--r--  tracker-neuralnet/lang/zh_CN.ts | 172
-rw-r--r--  tracker-neuralnet/model_adapters.cpp | 416
-rw-r--r--  tracker-neuralnet/model_adapters.h | 105
-rw-r--r--  tracker-neuralnet/models/head-localizer.onnx | bin 0 -> 279403 bytes
-rw-r--r--  tracker-neuralnet/models/head-pose-0.2-big.onnx | bin 0 -> 44760256 bytes
-rw-r--r--  tracker-neuralnet/models/head-pose-0.2-small.onnx | bin 0 -> 12926209 bytes
-rw-r--r--  tracker-neuralnet/models/head-pose-0.3-big-quantized.onnx | bin 0 -> 11385815 bytes
-rw-r--r--  tracker-neuralnet/neuralnet-tracker.qrc | 5
-rw-r--r--  tracker-neuralnet/neuralnet-trackercontrols.ui | 697
-rw-r--r--  tracker-neuralnet/opencv_contrib.h | 120
-rw-r--r--  tracker-neuralnet/preview.cpp | 135
-rw-r--r--  tracker-neuralnet/preview.h | 60
-rw-r--r--  tracker-neuralnet/redist/vcomp140.dll | bin 0 -> 151976 bytes
-rw-r--r--  tracker-neuralnet/tests.cpp | 58
-rw-r--r--  tracker-neuralnet/unscented_trafo.h | 132
26 files changed, 4073 insertions, 0 deletions
diff --git a/tracker-neuralnet/BUILD.md b/tracker-neuralnet/BUILD.md
new file mode 100644
index 00000000..b8994b00
--- /dev/null
+++ b/tracker-neuralnet/BUILD.md
@@ -0,0 +1,20 @@
+ONNX Runtime
+------------
+
+Recommended approach on Windows: build a shared library from source, using the static MSVC
+runtime library. The v1.6.0 branch should work fine.
+
+Source location: https://github.com/microsoft/onnxruntime
+
+In order to build, execute `build.bat` as follows:
+
+```
+$ build.bat --config Release --x86 --cmake_extra_defines CMAKE_INSTALL_PREFIX="D:\Dev\onnxruntime-x86-release" --build_dir .\buildx86\ --enable_msvc_static_runtime --build_shared_lib --skip_tests --cmake_generator "Visual Studio 15 2017"
+$ cmake --install .\buildx86\Release
+```
+
+Replace the argument for `--cmake_generator` if needed. Also adjust the build and install directories.
+
+This should place all required files in the directory specified by `CMAKE_INSTALL_PREFIX`.
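+
+To then build opentrack against this install, the prefix must be visible to the
+`find_package(ONNXRuntime)` call in CMake. A hedged sketch (assuming the find module honours
+the standard CMake search paths; otherwise pass whatever hint variable it expects):
+
+```
+$ cmake -DCMAKE_PREFIX_PATH="D:\Dev\onnxruntime-x86-release" <opentrack-source-dir>
+```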
+
+See also https://www.onnxruntime.ai/docs/how-to/build.html.
\ No newline at end of file
diff --git a/tracker-neuralnet/CMakeLists.txt b/tracker-neuralnet/CMakeLists.txt
new file mode 100644
index 00000000..3729c789
--- /dev/null
+++ b/tracker-neuralnet/CMakeLists.txt
@@ -0,0 +1,49 @@
+include(opentrack-opencv)
+set(host-spec "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR} ${CMAKE_SIZEOF_VOID_P}")
+if(host-spec MATCHES "^Linux i[3-6]86 4$")
+ return()
+endif()
+
+find_package(OpenCV QUIET)
+find_package(OpenMP QUIET) # Used to control number of onnx threads.
+find_package(ONNXRuntime QUIET)
+
+if(OpenCV_FOUND AND ONNXRuntime_FOUND AND OpenMP_FOUND)
+ if(MSVC)
+ add_compile_options(-EHsc)
+ add_definitions(-D_HAS_EXCEPTIONS=1)
+ endif()
+
+ otr_module(tracker-neuralnet)
+
+ target_link_libraries(${self}
+ opentrack-cv
+ onnxruntime::onnxruntime
+ opencv_calib3d
+ opencv_imgproc
+ opencv_imgcodecs
+ opencv_core
+ OpenMP::OpenMP_CXX
+ )
+
+ # OpenMP::OpenMP_CXX doesn't set up the -fopenmp linking option, so set it up ourselves.
+ if(NOT MSVC)
+ target_link_options(${self} PUBLIC ${OpenMP_CXX_FLAGS})
+ endif()
+
+ install(
+ FILES "models/head-localizer.onnx"
+ "models/head-pose-0.2-big.onnx"
+ "models/head-pose-0.2-small.onnx"
+ "models/head-pose-0.3-big-quantized.onnx"
+ DESTINATION "${opentrack-libexec}/models"
+ PERMISSIONS ${opentrack-perms-file}
+ )
+
+ if(WIN32)
+ otr_install_lib("${ONNXRuntime_RUNTIME}" ".")
+ endif()
+ if(MSVC)
+ otr_install_lib("redist/vcomp140.dll" "${opentrack-bin}")
+ endif()
+endif()
diff --git a/tracker-neuralnet/deadzone_filter.cpp b/tracker-neuralnet/deadzone_filter.cpp
new file mode 100644
index 00000000..fa96eeb3
--- /dev/null
+++ b/tracker-neuralnet/deadzone_filter.cpp
@@ -0,0 +1,173 @@
+#include "deadzone_filter.h"
+#include "model_adapters.h"
+#include "opencv_contrib.h"
+#include "unscented_trafo.h"
+
+#include <tuple>
+#include <opencv2/core/base.hpp>
+#include <opencv2/core/matx.hpp>
+#include <opencv2/core/quaternion.hpp>
+
+namespace neuralnet_tracker_ns
+{
+
+using namespace cvcontrib;
+
+// Number of degrees of freedom of position and rotation
+static constexpr int dofs = 6;
+
+using StateVec = cv::Vec<float,dofs>;
+using StateCov = cv::Matx<float,dofs,dofs>;
+
+static constexpr int num_sigmas = ukf_cv::MerweScaledSigmaPoints<dofs>::num_sigmas;
+// Rescaling factor for position/size living in the space of the face crop.
+// Applied prior to application of UKF to prevent numerical problems.
+static constexpr float img_scale = 200.f;
+// Similar rescaling factor for position/size that live in world space.
+static constexpr float world_scale = 1000.f; // mm
+
+// Fills the 6 DoF covariance factor, as in L L^T factorization.
+// Covariance is given wrt the tangent space of current predictions
+StateCov make_tangent_space_uncertainty_tril(const PoseEstimator::Face &face)
+{
+ StateCov tril = StateCov::eye();
+ set_minor<3,3>(tril, 0, 0, face.center_size_cov_tril / img_scale);
+ set_minor<3,3>(tril, 3, 3, face.rotaxis_cov_tril);
+ return tril;
+}
+
+
+QuatPose apply_offset(const QuatPose& pose, const StateVec& offset)
+{
+ // Unpack
+ const cv::Vec3f dp = { offset[0], offset[1], offset[2] };
+ const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] });
+ const auto p = pose.pos + dp;
+ const auto r = pose.rot * dr;
+ return { r, p };
+}
+
+
+std::tuple<cv::Quatf, cv::Point2f, float> apply_offset(const PoseEstimator::Face& face, const StateVec& offset)
+{
+ const cv::Quatf dr = cv::Quatf::createFromRvec(cv::Vec3f{ offset[3], offset[4], offset[5] });
+ const auto r = face.rotation * dr;
+
+ const cv::Point2f p = {
+ face.center.x + offset[0]*img_scale,
+ face.center.y + offset[1]*img_scale
+ };
+
+ // Intercept the case where the head size stddev became so large that some sigma points
+ // were created with a negative head size (mean - constant*stddev ...). A negative head size
+ // makes no sense, so we clamp it. That is fine: the unscented transform, where this function
+ // comes into play, is designed to handle non-linearities like this.
+ const float sz = std::max(0.1f*face.size, face.size + offset[2]*img_scale);
+
+ return {
+ r,
+ p,
+ sz,
+ };
+}
+
+
+StateVec relative_to(const QuatPose& reference, const QuatPose& pose)
+{
+ const auto p = pose.pos - reference.pos;
+ const auto r = toRotVec(reference.rot.conjugate()*pose.rot);
+ return StateVec{ p[0], p[1], p[2], r[0], r[1], r[2] };
+}
+
+
+ukf_cv::SigmaPoints<dofs> relative_to(const QuatPose& pose, const std::array<QuatPose,num_sigmas>& sigmas)
+{
+ ukf_cv::SigmaPoints<dofs> out; // Beware, the number of points is != the number of DoFs.
+ std::transform(sigmas.begin(), sigmas.end(), out.begin(), [&pose](const QuatPose& s) {
+ return relative_to(pose, s);
+ });
+ return out;
+}
+
+
+std::array<QuatPose,num_sigmas> compute_world_pose_from_sigma_point(const PoseEstimator::Face& face, const ukf_cv::SigmaPoints<dofs>& sigmas, Face2WorldFunction face2world)
+{
+ std::array<QuatPose,num_sigmas> out;
+ std::transform(sigmas.begin(), sigmas.end(), out.begin(), [face2world=std::move(face2world), &face](const StateVec& sigma_point) {
+ // First unpack the state vector and generate quaternion rotation w.r.t image space.
+ const auto [rotation, center, size] = apply_offset(face, sigma_point);
+ // Then transform ...
+ QuatPose pose = face2world(rotation, center, size);
+ pose.pos /= world_scale;
+ return pose;
+ });
+ return out;
+}
+
+
+StateVec apply_filter_to_offset(const StateVec& offset, const StateCov& offset_cov, float, const FiltParams& params)
+{
+ // Offset and Cov represent a multivariate normal distribution, which is the probability of the new pose measured w.r.t. the previous one:
+ //   Prob(x) ~ exp(-(x - mu)^T Cov^-1 (x - mu))
+ // We want to attenuate this offset, or zero it out completely, to obtain a deadzone-filter behaviour. The size of the deadzone shall be
+ // determined by the covariance projected onto the offset direction, like so:
+ //   take x = mu - mu / |mu| * alpha, then
+ //   p(alpha) ~ exp(-alpha^2 / |mu|^2 * mu^T Cov^-1 mu) = exp(-alpha^2 / sigma^2)  with  sigma^2 = |mu|^2 / (mu^T Cov^-1 mu).
+ // So this projection behaves like a 1d normal distribution with standard deviation sigma, which we use to scale the deadzone.
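+ // In the code below, len_div_sigma_sqr = offset^T Cov^-1 offset is then the squared length of the
+ // offset measured in standard deviations (a Mahalanobis distance). Illustration, assuming sigmoid()
+ // from opencv_contrib.h is the standard logistic function and using the shipped defaults
+ // (deadzone_size = 1, deadzone_hardness = 1.5): an offset of 0.5 sigma yields
+ // sigmoid((0.5 - 1) * 1.5) ~= 0.32 and is largely suppressed, while an offset of 3 sigma yields
+ // sigmoid((3 - 1) * 1.5) ~= 0.95 and passes through almost unchanged.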
+
+ bool ok = true;
+
+ const float len_div_sigma_sqr = offset.dot(offset_cov.inv(cv::DECOMP_CHOLESKY, &ok) * offset);
+
+ const float attenuation = (ok) ? sigmoid((std::sqrt(len_div_sigma_sqr) - params.deadzone_size)*params.deadzone_hardness) : 1.f;
+
+ // {
+ // std::cout << "cov diag: " << offset_cov.diag() << std::endl;
+ // std::cout << "offset: " << cv::norm(offset) << std::endl;
+ // std::cout << "len_div_sigma_sqr: " << cv::norm(len_div_sigma_sqr) << std::endl;
+ // std::cout << "attenuation (" << ok << "): " << attenuation << std::endl;
+ // }
+
+ return offset*attenuation;
+}
+
+
+QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose_, float dt, Face2WorldFunction face2world, const FiltParams& params)
+{
+ ukf_cv::MerweScaledSigmaPoints<dofs> unscentedtrafo;
+ auto previous_pose = previous_pose_;
+ previous_pose.pos /= world_scale;
+
+ // Get 6 DoF covariance factor for the predictions in the face crop space.
+ const auto cov_tril = make_tangent_space_uncertainty_tril(face);
+
+ // Compute so called sigma points. These represent the distribution from the covariance matrix in terms of
+ // sampling points.
+ const ukf_cv::SigmaPoints<dofs> sigmas = unscentedtrafo.compute_sigmas(to_vec(StateVec::zeros()), cov_tril, true);
+
+ // The filter uses an unscented transform to translate that into a distribution for the offset from the previous pose.
+ // The trick is to transform the sampling points and compute a covariance from them in the output space.
+ // Since there are many sigma points, the face2world callback is invoked once per point. The transform
+ // to 3d world space does more than Face2WorldFunction alone, because we also need to apply the sigma point
+ // (as a relative offset) to the pose in face crop space first.
+ const std::array<QuatPose,num_sigmas> pose_sigmas = compute_world_pose_from_sigma_point(face, sigmas, std::move(face2world));
+
+ // Compute sigma points relative to the previous pose
+ const ukf_cv::SigmaPoints<dofs> deltas_sigmas = relative_to(previous_pose, pose_sigmas);
+
+ // Compute the mean offset from the last pose and the spread due to the network's uncertainty output.
+ const auto [offset, offset_cov] = unscentedtrafo.compute_statistics(deltas_sigmas);
+
+ // Then the deadzone is applied to the offset and finally the previous pose is transformed by the offset to arrive
+ // at the final output.
+ const StateVec scaled_offset = apply_filter_to_offset(offset, offset_cov, dt, params);
+
+ QuatPose new_pose = apply_offset(previous_pose, scaled_offset);
+
+ new_pose.pos *= world_scale;
+
+ return new_pose;
+}
+
+
+} // namespace neuralnet_tracker_ns
\ No newline at end of file
diff --git a/tracker-neuralnet/deadzone_filter.h b/tracker-neuralnet/deadzone_filter.h
new file mode 100644
index 00000000..a9b6aada
--- /dev/null
+++ b/tracker-neuralnet/deadzone_filter.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "unscented_trafo.h"
+#include "opencv_contrib.h"
+#include "model_adapters.h"
+
+namespace neuralnet_tracker_ns
+{
+
+/// Represents a 6d pose by quaternion rotation and position vector.
+struct QuatPose {
+ cv::Quatf rot;
+ cv::Vec3f pos;
+};
+
+struct FiltParams
+{
+ float deadzone_hardness = 1.f;
+ float deadzone_size = 1.f;
+};
+
+/** Callback type for converting data from the `Face` struct to a 6d pose.
+*
+* This callback is needed because the conversion depends on things the filter does not have to know about. It is
+* called multiple times (once per sigma point) due to the way the uncertainty estimates are handled.
+*/
+using Face2WorldFunction = std::function<QuatPose (const cv::Quatf&, const cv::Point2f&, float)>;
+
+/** Applies a deadzone filter similar to the one used in the Hamilton filter.
+*
+* What sets this apart is that the deadzone size scales with the uncertainty estimate of the network.
+* The rotation uncertainty is represented by a covariance matrix for the distribution of a rotation vector which
+* describes the offset from the mean rotation (the quaternion in the `Face` struct).
+*/
+QuatPose apply_filter(const PoseEstimator::Face &face, const QuatPose& previous_pose, float dt, Face2WorldFunction face2world, const FiltParams& params);
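+
+// For reference, NeuralNetTracker::compute_filtered_pose() in ftnoir_tracker_neuralnet.cpp uses this
+// roughly as follows:
+//
+//   auto image2world = [this](const cv::Quatf& r, const cv::Point2f& xy, float sz) {
+//       return this->transform_to_world_pose(r, xy, sz); };
+//   QuatPose pose = apply_filter(face, *last_pose_, 1./fps_, std::move(image2world),
+//                                FiltParams{ float(settings_.deadzone_hardness),
+//                                            float(settings_.deadzone_size) });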
+
+} // namespace neuralnet_tracker_ns
\ No newline at end of file
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
new file mode 100644
index 00000000..c55ddf0c
--- /dev/null
+++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
@@ -0,0 +1,976 @@
+/* Copyright (c) 2021 Michael Welter <michael@welter-4d.de>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ */
+
+#include "ftnoir_tracker_neuralnet.h"
+#include "deadzone_filter.h"
+#include "opencv_contrib.h"
+
+#include "compat/sleep.hpp"
+#include "compat/math-imports.hpp"
+#include "compat/timer.hpp"
+#include "compat/check-visible.hpp"
+#include "cv/init.hpp"
+
+#include <omp.h>
+#include <onnxruntime_cxx_api.h>
+#include <opencv2/core.hpp>
+#include <opencv2/core/quaternion.hpp>
+
+#ifdef _MSC_VER
+# pragma warning(disable : 4702)
+#endif
+
+#include <QMutexLocker>
+#include <QDebug>
+#include <QFile>
+#include <QFileDialog>
+#include <QFileInfo>
+
+#include <cstdio>
+#include <cmath>
+#include <algorithm>
+#include <chrono>
+#include <string>
+#include <stdexcept>
+#include <unordered_map>
+
+// Some demo code for onnx
+// https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp
+// https://github.com/leimao/ONNX-Runtime-Inference/blob/main/src/inference.cpp
+
+namespace neuralnet_tracker_ns
+{
+
+using namespace cvcontrib;
+
+using numeric_types::vec3;
+using numeric_types::vec2;
+using numeric_types::mat33;
+
+#if _MSC_VER
+std::wstring convert(const QString &s) { return s.toStdWString(); }
+#else
+std::string convert(const QString &s) { return s.toStdString(); }
+#endif
+
+
+QDir get_default_model_directory()
+{
+ return QDir(OPENTRACK_BASE_PATH+ "/" OPENTRACK_LIBRARY_PATH "models");
+}
+
+
+int enum_to_fps(int value)
+{
+ int fps = 0;
+
+ switch (value)
+ {
+ default: eval_once(qDebug() << "neuralnet tracker: invalid fps enum value");
+ [[fallthrough]];
+ case fps_default: fps = 0; break;
+ case fps_30: fps = 30; break;
+ case fps_60: fps = 60; break;
+ case fps_75: fps = 75; break;
+ case fps_125: fps = 125; break;
+ case fps_200: fps = 200; break;
+ case fps_50: fps = 50; break;
+ case fps_100: fps = 100; break;
+ case fps_120: fps = 120; break;
+ case fps_300: fps = 300; break;
+ case fps_250: fps = 250; break;
+ }
+
+ return fps;
+}
+
+
+template<class F>
+struct OnScopeExit
+{
+ explicit OnScopeExit(F&& f) : f_{ f } {}
+ ~OnScopeExit() noexcept
+ {
+ f_();
+ }
+ F f_;
+};
+
+
+CamIntrinsics make_intrinsics(const cv::Mat& img, const Settings& settings)
+{
+ const int w = img.cols, h = img.rows;
+ const double diag_fov = settings.fov * M_PI / 180.;
+ const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w));
+ const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h));
+ const double focal_length_w = 1. / tan(.5 * fov_w);
+ const double focal_length_h = 1. / tan(.5 * fov_h);
+ /* a
+ ______ <--- here is sensor area
+ | /
+ | /
+ f | /
+ | / 2 x angle is the fov
+ |/
+ <--- here is the hole of the pinhole camera
+
+ So, a / f = tan(fov / 2)
+ => f = a/tan(fov/2)
+ What is a?
+ It is 1 if we define f in terms of clip space, where the image plane goes from -1 to 1, because a is the half-width.
+ */
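+ // Worked example (illustrative numbers): with the default diagonal FOV of 56 degrees and a 4:3 frame,
+ // fov_w = 2*atan(tan(28 deg) / sqrt(1 + (3/4)^2)) ~= 46 deg and fov_h ~= 35 deg, which gives
+ // focal_length_w ~= 2.35 and focal_length_h ~= 3.13 in these clip-space units.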
+
+ return {
+ (float)focal_length_w,
+ (float)focal_length_h,
+ (float)fov_w,
+ (float)fov_h
+ };
+}
+
+
+cv::Rect make_crop_rect_multiple_of(const cv::Size &size, int multiple)
+{
+ const int new_w = (size.width / multiple) * multiple;
+ const int new_h = (size.height / multiple) * multiple;
+ return cv::Rect(
+ (size.width-new_w)/2,
+ (size.height-new_h)/2,
+ new_w,
+ new_h
+ );
+}
+
+template<class T>
+cv::Rect_<T> squarize(const cv::Rect_<T> &r)
+{
+ cv::Point_<T> c{r.x + r.width/T(2), r.y + r.height/T(2)};
+ const T sz = std::max(r.height, r.width);
+ return {c.x - sz/T(2), c.y - sz/T(2), sz, sz};
+}
+
+
+template<class T>
+cv::Rect_<T> expand(const cv::Rect_<T>& r, T factor)
+{
+ // xnew = l+.5*w - w*f*0.5 = l + .5*(w - new_w)
+ const cv::Size_<T> new_size = { r.width * factor, r.height * factor };
+ const cv::Point_<T> new_tl = r.tl() + (as_point(r.size()) - as_point(new_size)) / T(2);
+ return cv::Rect_<T>(new_tl, new_size);
+}
+
+
+template<class T>
+cv::Rect_<T> ewa_filter(const cv::Rect_<T>& last, const cv::Rect_<T>& current, T alpha)
+{
+ const auto last_center = T(0.5) * (last.tl() + last.br());
+ const auto cur_center = T(0.5) * (current.tl() + current.br());
+ const cv::Point_<T> new_size = as_point(last.size()) + alpha * (as_point(current.size()) - as_point(last.size()));
+ const cv::Point_<T> new_center = last_center + alpha * (cur_center - last_center);
+ return cv::Rect_<T>(new_center - T(0.5) * new_size, as_size(new_size));
+}
+
+
+cv::Vec3f image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
+{
+ /*
+ Compute the location the network outputs in 3d space.
+
+ hhhhhh <- head size (meters)
+ \ | -----------------------
+ \ | \
+ \ | |
+ \ | |- x (meters)
+ ____ <- face.size / width |
+ \ | | |
+ \| |- focal length /
+ ------------------------
+ ------------------------------------------------>> z direction
+ z/x = zi / f
+ zi = image position
+ z = world position
+ f = focal length
+
+ We can also do deltas:
+ dz / x = dzi / f
+ => x = dz / dzi * f
+ which means we can compute x from the head size (dzi) if we assume some reference size (dz).
+ */
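+ // Rough numeric sketch (illustrative values): with a 640 px wide image, focal_length_w ~= 2.35
+ // (56 deg diagonal FOV at 4:3) and a detected face size of 100 px, head_size_vertical = 200 px and
+ // xpos = -(2.35 * 320) / 200 * 200 mm ~= -750 mm, i.e. roughly 0.75 m of viewing distance along -x.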
+ const float head_size_vertical = 2.f*size; // Size from the model is more like half the real vertical size of a human head.
+ const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / head_size_vertical * reference_size_in_mm;
+ const float zpos = (x / image_size.width * 2.f - 1.f) * xpos / intrinsics.focal_length_w;
+ const float ypos = (y / image_size.height * 2.f - 1.f) * xpos / intrinsics.focal_length_h;
+ return {xpos, ypos, zpos};
+}
+
+
+vec2 world_to_image(const cv::Vec3f& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
+{
+ const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w;
+ const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h;
+ const float x = (xscr+1.)*0.5f*image_size.width;
+ const float y = (yscr+1.)*0.5f*image_size.height;
+ return {x, y};
+}
+
+
+cv::Quatf image_to_world(cv::Quatf q)
+{
+ std::swap(q[1], q[3]);
+ q[1] = -q[1];
+ q[2] = -q[2];
+ q[3] = -q[3];
+ return q;
+}
+
+
+cv::Point2f normalize(const cv::Point2f &p, int h, int w)
+{
+ return {
+ p.x/w*2.f-1.f,
+ p.y/h*2.f-1.f
+ };
+}
+
+
+cv::Quatf rotation_from_two_vectors(const vec3 &a, const vec3 &b)
+{
+ // |axis| = |a| * |b| * sin(alpha)
+ const vec3 axis = a.cross(b);
+ // dot = |a|*|b|*cos(alpha)
+ const float dot = a.dot(b);
+ const float len = cv::norm(axis);
+ vec3 normed_axis = axis / len;
+ float angle = std::atan2(len, dot);
+ if (!(std::isfinite(normed_axis[0]) && std::isfinite(normed_axis[1]) && std::isfinite(normed_axis[2])))
+ {
+ angle = 0.f;
+ normed_axis = vec3{1.,0.,0.};
+ }
+ return cv::Quatf::createFromAngleAxis(angle, normed_axis);
+}
+
+
+// Computes correction due to head being off screen center.
+cv::Quatf compute_rotation_correction(const cv::Point3f& p)
+{
+ return rotation_from_two_vectors(
+ {-1.f,0.f,0.f}, p);
+}
+
+
+// Intersection over union. A value between 0 and 1 which measures the match between the bounding boxes.
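+// E.g. two unit squares overlapping by half have intersection 0.5 and union 1.5, giving an IoU of 1/3;
+// identical boxes give 1 and disjoint boxes give 0.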
+template<class T>
+T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b)
+{
+ auto i = a & b;
+ return double{i.area()} / (a.area()+b.area()-i.area());
+}
+
+
+class GuardedThreadCountSwitch
+{
+ int old_num_threads_cv_ = 1;
+ int old_num_threads_omp_ = 1;
+ public:
+ GuardedThreadCountSwitch(int num_threads)
+ {
+ old_num_threads_cv_ = cv::getNumThreads();
+ old_num_threads_omp_ = omp_get_num_threads();
+ omp_set_num_threads(num_threads);
+ cv::setNumThreads(num_threads);
+ }
+
+ ~GuardedThreadCountSwitch()
+ {
+ omp_set_num_threads(old_num_threads_omp_);
+ cv::setNumThreads(old_num_threads_cv_);
+ }
+
+ GuardedThreadCountSwitch(const GuardedThreadCountSwitch&) = delete;
+ GuardedThreadCountSwitch& operator=(const GuardedThreadCountSwitch&) = delete;
+};
+
+
+bool NeuralNetTracker::detect()
+{
+ double inference_time = 0.;
+
+ OnScopeExit update_inference_time{ [&]() {
+
+ QMutexLocker lck{ &stats_mtx_ };
+ inference_time_ = inference_time;
+ } };
+
+ // If there is no past ROI from the localizer or if the match of its output
+ // with the current ROI is too poor we have to run it again. This causes a
+ // latency spike of maybe an additional 50%. But it only occurs when the user
+ // moves his head far enough - or when the tracking is lost ...
+ if (!last_localizer_roi_ || !last_roi_ ||
+ iou(*last_localizer_roi_,*last_roi_)<0.25)
+ {
+ auto [p, rect] = localizer_->run(grayscale_);
+ inference_time += localizer_->last_inference_time_millis();
+
+ if (last_roi_ && iou(rect,*last_roi_)>=0.25 && p > 0.5)
+ {
+ // The new ROI matches the result from tracking, so the user is still there. To avoid
+ // disturbing recurrent models, we only update the localizer ROI and leave the tracking
+ // ROI untouched.
+ last_localizer_roi_ = rect;
+ }
+ else if (p > 0.5 && rect.height > 32 && rect.width > 32)
+ {
+ // Tracking probably got lost since the ROI's don't match, but the
+ // localizer still finds a face, so we use the ROI from the localizer
+ last_localizer_roi_ = rect;
+ last_roi_ = rect;
+ }
+ else
+ {
+ // Tracking lost and no localization result. The user probably can't be seen.
+ last_roi_.reset();
+ last_localizer_roi_.reset();
+ }
+ }
+
+ if (!last_roi_)
+ {
+ // Last iteration the tracker failed to generate a trustworthy
+ // roi and the localizer also cannot find a face.
+ draw_gizmos({}, {});
+ return false;
+ }
+
+ auto face = poseestimator_->run(grayscale_, *last_roi_);
+ inference_time += poseestimator_->last_inference_time_millis();
+
+ if (!face)
+ {
+ last_roi_.reset();
+ draw_gizmos({}, {});
+ return false;
+ }
+
+ cv::Rect2f roi = expand(face->box, (float)settings_.roi_zoom);
+
+ last_roi_ = ewa_filter(*last_roi_, roi, float(settings_.roi_filter_alpha));
+
+ QuatPose pose = compute_filtered_pose(*face);
+ last_pose_ = pose;
+
+ Affine pose_affine = {
+ pose.rot.toRotMat3x3(cv::QUAT_ASSUME_UNIT),
+ pose.pos };
+
+ {
+ QMutexLocker lck(&mtx_);
+ last_pose_affine_ = pose_affine;
+ }
+
+ draw_gizmos(*face, last_pose_affine_);
+
+ return true;
+}
+
+
+void NeuralNetTracker::draw_gizmos(
+ const std::optional<PoseEstimator::Face> &face,
+ const Affine& pose)
+{
+ if (!is_visible_)
+ return;
+
+ preview_.draw_gizmos(
+ face,
+ last_roi_,
+ last_localizer_roi_,
+ world_to_image(pose.t, grayscale_.size(), intrinsics_));
+
+ if (settings_.show_network_input)
+ {
+ cv::Mat netinput = poseestimator_->last_network_input();
+ preview_.overlay_netinput(netinput);
+ }
+}
+
+
+QuatPose NeuralNetTracker::transform_to_world_pose(const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) const
+{
+ const vec3 face_world_pos = image_to_world(
+ face_xy.x, face_xy.y, face_size, HEAD_SIZE_MM,
+ grayscale_.size(),
+ intrinsics_);
+
+ const cv::Quatf rot_correction = compute_rotation_correction(
+ face_world_pos);
+
+ cv::Quatf rot = rot_correction * image_to_world(face_rotation);
+
+ // However, this is in general not the location of the rotation joint in the neck.
+ // So we need an extra offset, which we determine from
+ // x,y,z-pos = head_joint_loc + R_face * offset
+ const vec3 local_offset = vec3{
+ static_cast<float>(settings_.offset_fwd),
+ static_cast<float>(settings_.offset_up),
+ static_cast<float>(settings_.offset_right)};
+ const vec3 offset = rotate(rot, local_offset);
+ const vec3 pos = face_world_pos + offset;
+
+ return { rot, pos };
+}
+
+
+QuatPose NeuralNetTracker::compute_filtered_pose(const PoseEstimator::Face &face)
+{
+ if (fps_ > 0.001 && last_pose_ && poseestimator_->has_uncertainty())
+ {
+ auto image2world = [this](const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) {
+ return this->transform_to_world_pose(face_rotation, face_xy, face_size); };
+
+ return apply_filter(
+ face,
+ *last_pose_,
+ 1./fps_,
+ std::move(image2world),
+ FiltParams{
+ float(settings_.deadzone_hardness),
+ float(settings_.deadzone_size)
+ });
+ }
+ else
+ {
+ return transform_to_world_pose(face.rotation, face.center, face.size);
+ }
+}
+
+
+NeuralNetTracker::NeuralNetTracker()
+{
+ opencv_init();
+ neuralnet_tracker_tests::run();
+}
+
+
+NeuralNetTracker::~NeuralNetTracker()
+{
+ requestInterruption();
+ wait();
+ // fast start/stop causes breakage
+ portable::sleep(1000);
+}
+
+
+module_status NeuralNetTracker::start_tracker(QFrame* videoframe)
+{
+ videoframe->show();
+ video_widget_ = std::make_unique<cv_video_widget>(videoframe);
+ layout_ = std::make_unique<QHBoxLayout>();
+ layout_->setContentsMargins(0, 0, 0, 0);
+ layout_->addWidget(&*video_widget_);
+ videoframe->setLayout(&*layout_);
+ video_widget_->show();
+ num_threads_ = settings_.num_threads;
+ start();
+ return status_ok();
+}
+
+
+bool NeuralNetTracker::load_and_initialize_model()
+{
+ const QString localizer_model_path_enc =
+ OPENTRACK_BASE_PATH+"/" OPENTRACK_LIBRARY_PATH "/models/head-localizer.onnx";
+ const QString poseestimator_model_path_enc = get_posenet_filename();
+
+ try
+ {
+ env_ = Ort::Env{
+ OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR,
+ "tracker-neuralnet"
+ };
+ auto opts = Ort::SessionOptions{};
+ // Do the thread settings here do anything?
+ // There is a warning which says to control the number of threads via
+ // OpenMP settings, which is what we do.
+ opts.SetIntraOpNumThreads(num_threads_);
+ opts.SetInterOpNumThreads(1);
+ allocator_info_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+
+ localizer_.emplace(
+ allocator_info_,
+ Ort::Session{env_, convert(localizer_model_path_enc).c_str(), opts});
+
+ qDebug() << "Loading pose net " << poseestimator_model_path_enc;
+ poseestimator_.emplace(
+ allocator_info_,
+ Ort::Session{env_, convert(poseestimator_model_path_enc).c_str(), opts});
+ }
+ catch (const Ort::Exception &e)
+ {
+ qDebug() << "Failed to initialize the neural network models. ONNX error message: "
+ << e.what();
+ return false;
+ }
+ catch (const std::exception &e)
+ {
+ qDebug() << "Failed to initialize the neural network models. Error message: " << e.what();
+ return false;
+ }
+
+ return true;
+}
+
+
+bool NeuralNetTracker::open_camera()
+{
+ int rint = std::clamp(*settings_.resolution, 0, (int)std::size(resolution_choices)-1);
+ resolution_tuple res = resolution_choices[rint];
+ int fps = enum_to_fps(settings_.force_fps);
+
+ QMutexLocker l(&camera_mtx_);
+
+ camera_ = video::make_camera(settings_.camera_name);
+
+ if (!camera_)
+ return false;
+
+ video::impl::camera::info args {};
+
+ if (res.width)
+ {
+ args.width = res.width;
+ args.height = res.height;
+ }
+ if (fps)
+ args.fps = fps;
+
+ args.use_mjpeg = settings_.use_mjpeg;
+
+ if (!camera_->start(args))
+ {
+ qDebug() << "neuralnet tracker: can't open camera";
+ return false;
+ }
+
+ return true;
+}
+
+
+void NeuralNetTracker::run()
+{
+ preview_.init(*video_widget_);
+
+ GuardedThreadCountSwitch switch_num_threads_to(num_threads_);
+
+ if (!open_camera())
+ return;
+
+ if (!load_and_initialize_model())
+ return;
+
+ std::chrono::high_resolution_clock clk;
+
+ while (!isInterruptionRequested())
+ {
+ is_visible_ = check_is_visible();
+ auto t = clk.now();
+ {
+ QMutexLocker l(&camera_mtx_);
+
+ auto [ img, res ] = camera_->get_frame();
+
+ if (!res)
+ {
+ l.unlock();
+ portable::sleep(100);
+ continue;
+ }
+
+ {
+ QMutexLocker lck{&stats_mtx_};
+ resolution_ = { img.width, img.height };
+ }
+
+ auto color = prepare_input_image(img);
+
+ if (is_visible_)
+ preview_.copy_video_frame(color);
+
+ switch (img.channels)
+ {
+ case 1:
+ grayscale_.create(img.height, img.width, CV_8UC1);
+ color.copyTo(grayscale_);
+ break;
+ case 3:
+ cv::cvtColor(color, grayscale_, cv::COLOR_BGR2GRAY);
+ break;
+ default:
+ qDebug() << "Can't handle" << img.channels << "color channels";
+ return;
+ }
+ }
+
+ intrinsics_ = make_intrinsics(grayscale_, settings_);
+
+ detect();
+
+ if (is_visible_)
+ preview_.copy_to_widget(*video_widget_);
+
+ update_fps(
+ std::chrono::duration_cast<std::chrono::milliseconds>(
+ clk.now() - t).count()*1.e-3);
+ }
+}
+
+
+cv::Mat NeuralNetTracker::prepare_input_image(const video::frame& frame)
+{
+ auto img = cv::Mat(frame.height, frame.width, CV_8UC(frame.channels), (void*)frame.data, frame.stride);
+
+ // Crop if aspect ratio is not 4:3
+ if (img.rows*4 != img.cols*3)
+ {
+ img = img(make_crop_rect_for_aspect(img.size(), 4, 3));
+ }
+
+ img = img(make_crop_rect_multiple_of(img.size(), 4));
+
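+ // Downscale large frames by up to a factor of 4 (two pyrDown halvings). For example, a 1920x1080
+ // frame is first cropped to 4:3 (1440x1080), then reduced to 720x540 and finally to 360x270.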
+ if (img.cols > 640)
+ {
+ cv::pyrDown(img, downsized_original_images_[0]);
+ img = downsized_original_images_[0];
+ }
+ if (img.cols > 640)
+ {
+ cv::pyrDown(img, downsized_original_images_[1]);
+ img = downsized_original_images_[1];
+ }
+
+ return img;
+}
+
+
+void NeuralNetTracker::update_fps(double dt)
+{
+ const double alpha = dt/(dt + RC);
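+ // This makes fps_ a single-pole low-pass (exponential moving average) with time constant RC = 0.25 s.
+ // E.g. at dt = 1/60 s, alpha = (1/60) / ((1/60) + 0.25) = 0.0625, so each frame blends 1/16 of the
+ // instantaneous rate 1/dt into the reported value.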
+ if (dt > 1e-6)
+ {
+ QMutexLocker lck{&stats_mtx_};
+ fps_ *= 1 - alpha;
+ fps_ += alpha * 1./dt;
+ }
+}
+
+
+void NeuralNetTracker::data(double *data)
+{
+ Affine tmp = [&]()
+ {
+ QMutexLocker lck(&mtx_);
+ return last_pose_affine_;
+ }();
+
+ const auto& mx = tmp.R.col(0);
+ const auto& my = tmp.R.col(1);
+ const auto& mz = tmp.R.col(2);
+
+ // For reference: https://en.wikipedia.org/wiki/Euler_angles. Section "Rotation matrix". The relevant matrix is
+ // under "Tait-Bryan angles", row with "Y_alpha Z_beta X_gamma = ...".
+ // For the NN tracker, x is forward and y is up. We can see that the x axis is independent of roll, so it
+ // is relatively easy to figure out the yaw and pitch angles (alpha and beta).
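+ // Concretely, for that convention the first column of R is (cos(alpha)*cos(beta), sin(beta), -cos(beta)*sin(alpha))
+ // and its second row is (sin(beta), cos(beta)*cos(gamma), -cos(beta)*sin(gamma)) (per the Wikipedia table above),
+ // which is what the atan2 expressions below rely on.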
+ const float yaw = std::atan2(mx(2), mx(0));
+ const float pitch = -std::atan2(-mx(1), std::sqrt(mx(2)*mx(2)+mx(0)*mx(0)));
+ // For the roll angle we recognize that the matrix entries in the second row contain cos(pitch)*cos(roll), and
+ // cos(pitch)*sin(roll). Using atan2 eliminates the common pitch factor and we obtain the roll angle.
+ const float roll = std::atan2(-mz(1), my(1));
+ {
+ constexpr double rad2deg = 180/M_PI;
+ data[Yaw] = rad2deg * yaw;
+ data[Pitch] = rad2deg * pitch;
+ data[Roll] = -rad2deg * roll;
+
+ // convert to cm
+ data[TX] = -tmp.t[2] * 0.1;
+ data[TY] = tmp.t[1] * 0.1;
+ data[TZ] = -tmp.t[0] * 0.1;
+ }
+}
+
+
+Affine NeuralNetTracker::pose()
+{
+ QMutexLocker lck(&mtx_);
+ return last_pose_affine_;
+}
+
+
+std::tuple<cv::Size,double, double> NeuralNetTracker::stats() const
+{
+ QMutexLocker lck(&stats_mtx_);
+ return { resolution_, fps_, inference_time_ };
+}
+
+
+QString NeuralNetTracker::get_posenet_filename() const
+{
+ QString filename = settings_.posenet_file;
+ if (QFileInfo(filename).isRelative())
+ filename = get_default_model_directory().absoluteFilePath(filename);
+ return filename;
+}
+
+
+void NeuralNetDialog::make_fps_combobox()
+{
+ for (int k = 0; k < fps_MAX; k++)
+ {
+ const int hz = enum_to_fps(k);
+ const QString name = (hz == 0) ? tr("Default") : QString::number(hz);
+ ui_.cameraFPS->addItem(name, k);
+ }
+}
+
+void NeuralNetDialog::make_resolution_combobox()
+{
+ int k=0;
+ for (const auto [w, h] : resolution_choices)
+ {
+ const QString s = (w == 0)
+ ? tr("Default")
+ : QString::number(w) + " x " + QString::number(h);
+ ui_.resolution->addItem(s, k++);
+ }
+}
+
+
+NeuralNetDialog::NeuralNetDialog() :
+ trans_calib_(1, 2)
+{
+ ui_.setupUi(this);
+
+ make_fps_combobox();
+ make_resolution_combobox();
+
+ for (const auto& str : video::camera_names())
+ ui_.cameraName->addItem(str);
+
+ tie_setting(settings_.camera_name, ui_.cameraName);
+ tie_setting(settings_.fov, ui_.cameraFOV);
+ tie_setting(settings_.offset_fwd, ui_.tx_spin);
+ tie_setting(settings_.offset_up, ui_.ty_spin);
+ tie_setting(settings_.offset_right, ui_.tz_spin);
+ tie_setting(settings_.show_network_input, ui_.showNetworkInput);
+ tie_setting(settings_.roi_filter_alpha, ui_.roiFilterAlpha);
+ tie_setting(settings_.use_mjpeg, ui_.use_mjpeg);
+ tie_setting(settings_.roi_zoom, ui_.roiZoom);
+ tie_setting(settings_.num_threads, ui_.threadCount);
+ tie_setting(settings_.resolution, ui_.resolution);
+ tie_setting(settings_.force_fps, ui_.cameraFPS);
+ tie_setting(settings_.posenet_file, ui_.posenetFileDisplay);
+
+ connect(ui_.buttonBox, SIGNAL(accepted()), this, SLOT(doOK()));
+ connect(ui_.buttonBox, SIGNAL(rejected()), this, SLOT(doCancel()));
+ connect(ui_.camera_settings, SIGNAL(clicked()), this, SLOT(camera_settings()));
+ connect(ui_.posenetSelectButton, SIGNAL(clicked()), this, SLOT(onSelectPoseNetFile()));
+ connect(&settings_.camera_name, value_::value_changed<QString>(), this, &NeuralNetDialog::update_camera_settings_state);
+
+ update_camera_settings_state(settings_.camera_name);
+
+ connect(&calib_timer_, &QTimer::timeout, this, &NeuralNetDialog::trans_calib_step);
+ calib_timer_.setInterval(35);
+ connect(ui_.tcalib_button,SIGNAL(toggled(bool)), this, SLOT(startstop_trans_calib(bool)));
+
+ connect(&tracker_status_poll_timer_, &QTimer::timeout, this, &NeuralNetDialog::status_poll);
+ tracker_status_poll_timer_.setInterval(250);
+ tracker_status_poll_timer_.start();
+}
+
+void NeuralNetDialog::save()
+{
+ settings_.b->save();
+}
+
+void NeuralNetDialog::reload()
+{
+ settings_.b->reload();
+}
+
+void NeuralNetDialog::doOK()
+{
+ save();
+ close();
+}
+
+
+void NeuralNetDialog::doCancel()
+{
+ close();
+}
+
+
+void NeuralNetDialog::camera_settings()
+{
+ if (tracker_)
+ {
+ QMutexLocker l(&tracker_->camera_mtx_);
+ (void)tracker_->camera_->show_dialog();
+ }
+ else
+ (void)video::show_dialog(settings_.camera_name);
+}
+
+
+void NeuralNetDialog::update_camera_settings_state(const QString& name)
+{
+ (void)name;
+ ui_.camera_settings->setEnabled(true);
+}
+
+
+void NeuralNetDialog::register_tracker(ITracker * x)
+{
+ tracker_ = static_cast<NeuralNetTracker*>(x);
+ ui_.tcalib_button->setEnabled(true);
+}
+
+
+void NeuralNetDialog::unregister_tracker()
+{
+ tracker_ = nullptr;
+ ui_.tcalib_button->setEnabled(false);
+}
+
+bool NeuralNetDialog::embeddable() noexcept
+{
+ return true;
+}
+
+void NeuralNetDialog::set_buttons_visible(bool x)
+{
+ ui_.buttonBox->setVisible(x);
+}
+
+void NeuralNetDialog::status_poll()
+{
+ QString status;
+ if (!tracker_)
+ {
+ status = tr("Tracker Offline");
+ }
+ else
+ {
+ auto [ res, fps, inference_time ] = tracker_->stats();
+ status = tr("%1x%2 @ %3 FPS / Inference: %4 ms").arg(res.width).arg(res.height).arg(int(fps)).arg(inference_time, 0, 'f', 1);
+ }
+ ui_.resolution_display->setText(status);
+}
+
+
+void NeuralNetDialog::trans_calib_step()
+{
+ if (tracker_)
+ {
+ const Affine X_CM = [&]() {
+ QMutexLocker l(&calibrator_mutex_);
+ return tracker_->pose();
+ }();
+ trans_calib_.update(X_CM.R, X_CM.t);
+ auto [_, nsamples] = trans_calib_.get_estimate();
+
+ constexpr int min_yaw_samples = 15;
+ constexpr int min_pitch_samples = 12;
+ constexpr int min_samples = min_yaw_samples+min_pitch_samples;
+
+ // Don't bother counting roll samples. Roll calibration is hard enough
+ // that it's a hidden unsupported feature anyway.
+
+ QString sample_feedback;
+ if (nsamples[0] < min_yaw_samples)
+ sample_feedback = tr("%1 yaw samples. Yaw more to %2 samples for stable calibration.").arg(nsamples[0]).arg(min_yaw_samples);
+ else if (nsamples[1] < min_pitch_samples)
+ sample_feedback = tr("%1 pitch samples. Pitch more to %2 samples for stable calibration.").arg(nsamples[1]).arg(min_pitch_samples);
+ else
+ {
+ const int nsamples_total = nsamples[0] + nsamples[1];
+ sample_feedback = tr("%1 samples. Over %2, good!").arg(nsamples_total).arg(min_samples);
+ }
+ ui_.sample_count_display->setText(sample_feedback);
+ }
+ else
+ startstop_trans_calib(false);
+}
+
+
+void NeuralNetDialog::startstop_trans_calib(bool start)
+{
+ QMutexLocker l(&calibrator_mutex_);
+ // FIXME: does not work ...
+ if (start)
+ {
+ qDebug() << "pt: starting translation calibration";
+ calib_timer_.start();
+ trans_calib_.reset();
+ ui_.sample_count_display->setText(QString());
+ // The tracker must run with zeroed offsets for calibration.
+ settings_.offset_fwd = 0;
+ settings_.offset_up = 0;
+ settings_.offset_right = 0;
+ }
+ else
+ {
+ calib_timer_.stop();
+ qDebug() << "pt: stopping translation calibration";
+ {
+ auto [tmp, nsamples] = trans_calib_.get_estimate();
+ settings_.offset_fwd = int(tmp[0]);
+ settings_.offset_up = int(tmp[1]);
+ settings_.offset_right = int(tmp[2]);
+ }
+ }
+ ui_.tx_spin->setEnabled(!start);
+ ui_.ty_spin->setEnabled(!start);
+ ui_.tz_spin->setEnabled(!start);
+
+ if (start)
+ ui_.tcalib_button->setText(tr("Stop calibration"));
+ else
+ ui_.tcalib_button->setText(tr("Start calibration"));
+}
+
+
+void NeuralNetDialog::onSelectPoseNetFile()
+{
+ const auto root = get_default_model_directory();
+ // Start with the current setting
+ QString filename = settings_.posenet_file;
+ // If the filename is relative then assume that the file is located under the
+ // model directory. Under regular use this should always be the case.
+ if (QFileInfo(filename).isRelative())
+ filename = root.absoluteFilePath(filename);
+ filename = QFileDialog::getOpenFileName(this,
+ tr("Select Pose Net ONNX"), filename, tr("ONNX Files (*.onnx)"));
+ // In case the user aborted.
+ if (filename.isEmpty())
+ return;
+ // When a file under the model directory was selected we can get rid of the
+ // directory prefix. This is more robust than storing absolute paths, e.g.
+ // in case the user moves the opentrack install folder / reuses old settings.
+ // When the file is not in the model directory, we have to use the absolute path,
+ // which is also fine as a developer feature.
+ if (filename.startsWith(root.absolutePath()))
+ filename = root.relativeFilePath(filename);
+ settings_.posenet_file = filename;
+}
+
+
+Settings::Settings() : opts("neuralnet-tracker") {}
+
+} // neuralnet_tracker_ns
+
+OPENTRACK_DECLARE_TRACKER(NeuralNetTracker, NeuralNetDialog, NeuralNetMetadata)
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.h b/tracker-neuralnet/ftnoir_tracker_neuralnet.h
new file mode 100644
index 00000000..fe755f51
--- /dev/null
+++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.h
@@ -0,0 +1,230 @@
+/* Copyright (c) 2021 Michael Welter <michael@welter-4d.de>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ */
+
+#pragma once
+
+#include "ui_neuralnet-trackercontrols.h"
+#include "model_adapters.h"
+#include "deadzone_filter.h"
+#include "preview.h"
+
+#include "options/options.hpp"
+#include "api/plugin-api.hpp"
+#include "cv/video-widget.hpp"
+#include "cv/translation-calibrator.hpp"
+#include "cv/numeric.hpp"
+#include "compat/timer.hpp"
+#include "video/camera.hpp"
+#include "cv/affine.hpp"
+
+#include <QObject>
+#include <QThread>
+#include <QMutex>
+#include <QHBoxLayout>
+#include <QDialog>
+#include <QTimer>
+
+#include <memory>
+#include <cinttypes>
+#include <array>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+
+namespace neuralnet_tracker_ns
+{
+
+
+using namespace options;
+
+
+enum fps_choices
+{
+ fps_default = 0,
+ fps_30 = 1,
+ fps_60 = 2,
+ fps_75 = 3,
+ fps_125 = 4,
+ fps_200 = 5,
+ fps_50 = 6,
+ fps_100 = 7,
+ fps_120 = 8,
+ fps_300 = 9,
+ fps_250 = 10,
+ fps_MAX = 11,
+};
+
+struct resolution_tuple
+{
+ int width;
+ int height;
+};
+
+static const std::array<resolution_tuple, 7> resolution_choices =
+{{
+ { 320, 240 },
+ { 640, 480 },
+ { 800, 600 },
+ { 1024, 768 },
+ { 1280, 720 },
+ { 1920, 1080},
+ { 0, 0 }
+}};
+
+
+struct Settings : opts {
+ value<int> offset_fwd { b, "offset-fwd", 200 }, // Millimeters
+ offset_up { b, "offset-up", 0 },
+ offset_right { b, "offset-right", 0 };
+ value<QString> camera_name { b, "camera-name", ""};
+ value<int> fov { b, "field-of-view", 56 };
+ value<fps_choices> force_fps { b, "force-fps", fps_default };
+ value<bool> show_network_input { b, "show-network-input", false };
+ value<double> roi_filter_alpha{ b, "roi-filter-alpha", 1. };
+ value<double> roi_zoom{ b, "roi-zoom", 1. };
+ value<bool> use_mjpeg { b, "use-mjpeg", false };
+ value<int> num_threads { b, "num-threads", 1 };
+ value<int> resolution { b, "force-resolution", 0 };
+ value<double> deadzone_size { b, "deadzone-size", 1. };
+ value<double> deadzone_hardness { b, "deadzone-hardness", 1.5 };
+ value<QString> posenet_file { b, "posenet-file", "head-pose-0.3-big-quantized.onnx" };
+ Settings();
+};
+
+
+struct CamIntrinsics
+{
+ float focal_length_w;
+ float focal_length_h;
+ float fov_w;
+ float fov_h;
+};
+
+
+class NeuralNetTracker : protected virtual QThread, public ITracker
+{
+ Q_OBJECT
+public:
+ NeuralNetTracker();
+ ~NeuralNetTracker() override;
+ module_status start_tracker(QFrame* frame) override;
+ void data(double *data) override;
+ void run() override;
+ Affine pose();
+ std::tuple<cv::Size, double, double> stats() const;
+
+ QMutex camera_mtx_;
+ std::unique_ptr<video::impl::camera> camera_;
+
+private:
+ bool detect();
+ bool open_camera();
+ void set_intrinsics();
+ cv::Mat prepare_input_image(const video::frame& frame);
+ bool load_and_initialize_model();
+ void draw_gizmos(
+ const std::optional<PoseEstimator::Face> &face,
+ const Affine& pose);
+ void update_fps(double dt);
+ // Secretly applies filtering while computing the pose in 3d space.
+ QuatPose compute_filtered_pose(const PoseEstimator::Face &face);
+ // Computes the pose in 3d space directly from the network outputs (no filtering).
+ QuatPose transform_to_world_pose(const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) const;
+ QString get_posenet_filename() const;
+
+ Settings settings_;
+ std::optional<Localizer> localizer_;
+ std::optional<PoseEstimator> poseestimator_;
+ Ort::Env env_{nullptr};
+ Ort::MemoryInfo allocator_info_{nullptr};
+
+ CamIntrinsics intrinsics_{};
+ cv::Mat grayscale_;
+ std::array<cv::Mat,2> downsized_original_images_ = {}; // Image pyramid
+ std::optional<cv::Rect2f> last_localizer_roi_;
+ std::optional<cv::Rect2f> last_roi_;
+ static constexpr float HEAD_SIZE_MM = 200.f; // In the vertical. Approximately.
+
+ mutable QMutex stats_mtx_;
+ double fps_ = 0;
+ double inference_time_ = 0;
+ cv::Size resolution_ = {};
+
+ static constexpr double RC = .25;
+ int num_threads_ = 1;
+ bool is_visible_ = true;
+
+ QMutex mtx_ = {}; // Protects the pose
+ std::optional<QuatPose> last_pose_ = {};
+ Affine last_pose_affine_ = {};
+
+ Preview preview_;
+ std::unique_ptr<cv_video_widget> video_widget_;
+ std::unique_ptr<QHBoxLayout> layout_;
+};
+
+
+class NeuralNetDialog : public ITrackerDialog
+{
+ Q_OBJECT
+public:
+ NeuralNetDialog();
+ void register_tracker(ITracker * x) override;
+ void unregister_tracker() override;
+
+ bool embeddable() noexcept override;
+ void set_buttons_visible(bool x) override;
+private:
+ void make_fps_combobox();
+ void make_resolution_combobox();
+
+ Ui::Form ui_;
+ Settings settings_;
+ // Calibration code mostly taken from point tracker
+ QTimer calib_timer_;
+ TranslationCalibrator trans_calib_;
+ QMutex calibrator_mutex_;
+ QTimer tracker_status_poll_timer_;
+ NeuralNetTracker* tracker_ = nullptr;
+
+private Q_SLOTS:
+ void save() override;
+ void reload() override;
+ void doOK();
+ void doCancel();
+ void camera_settings();
+ void update_camera_settings_state(const QString& name);
+ void startstop_trans_calib(bool start);
+ void trans_calib_step();
+ void status_poll();
+ void onSelectPoseNetFile();
+};
+
+
+class NeuralNetMetadata : public Metadata
+{
+ Q_OBJECT
+ QString name() override { return QString("neuralnet tracker"); }
+ QIcon icon() override { return QIcon(":/images/neuralnet.png"); }
+};
+
+
+} // neuralnet_tracker_ns
+
+
+namespace neuralnet_tracker_tests
+{
+
+void run();
+
+}
+
+
+using neuralnet_tracker_ns::NeuralNetTracker;
+using neuralnet_tracker_ns::NeuralNetDialog;
+using neuralnet_tracker_ns::NeuralNetMetadata;
diff --git a/tracker-neuralnet/images/neuralnet.png b/tracker-neuralnet/images/neuralnet.png
new file mode 100644
index 00000000..1a10c53c
--- /dev/null
+++ b/tracker-neuralnet/images/neuralnet.png
Binary files differ
diff --git a/tracker-neuralnet/lang/de_DE.ts b/tracker-neuralnet/lang/de_DE.ts
new file mode 100644
index 00000000..6261eec0
--- /dev/null
+++ b/tracker-neuralnet/lang/de_DE.ts
@@ -0,0 +1,172 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.1" language="de_DE">
+<context>
+ <name>Form</name>
+ <message>
+ <source>Tracker settings</source>
+ <translation>Tracker-Einstellungen</translation>
+ </message>
+ <message>
+ <source>Head Center Offset</source>
+ <translation>Versatz zur Kopfmitte</translation>
+ </message>
+ <message>
+ <source>Right</source>
+ <translation>Rechts</translation>
+ </message>
+ <message>
+ <source>Forward</source>
+ <translation>Vorwärts</translation>
+ </message>
+ <message>
+ <source>Up</source>
+ <translation>Hoch</translation>
+ </message>
+ <message>
+ <source> mm</source>
+ <translation> mm</translation>
+ </message>
+ <message>
+ <source>Use only yaw and pitch while calibrating.
+Don&apos;t roll or change position.</source>
+ <translation>Während der Kalibrierung nur gieren und nicken.
+Bitte nicht rollen oder die Position ändern.</translation>
+ </message>
+ <message>
+ <source>Start calibration</source>
+ <translation>Kalibrierung starten</translation>
+ </message>
+ <message>
+ <source>Camera Configuration</source>
+ <translation>Kamera-Konfiguration</translation>
+ </message>
+ <message>
+ <source>Diagonal FOV</source>
+ <translation>Diagonales Sichtfeld</translation>
+ </message>
+ <message>
+ <source>Camera name</source>
+ <translation>Kamera-Name</translation>
+ </message>
+ <message>
+ <source>Field of view. Needed to transform the pose to world coordinates.</source>
+ <translation>Sichtfeld. Benötigt, um die Pose in Welt-Koordinaten zu übersetzen.</translation>
+ </message>
+ <message>
+ <source>The requested resolution for cases where the camera delivers maximum frame rate only for a particular resolution. The image may still be downscaled to the internal resolution.</source>
+ <translation>Die angeforderte Auflösung für Fälle, in denen die Kamera die maximale Bildrate nur bei bestimmten Auflösungen ausgibt. Das Bild wird möglicherweise weiterhin herunterskaliert auf die interne Auflösung.</translation>
+ </message>
+ <message>
+ <source>Resolution</source>
+ <translation>Auflösung</translation>
+ </message>
+ <message>
+ <source>Requested video frame rate. Actual setting may not be supported by the camera.</source>
+ <translation>Angeforderte Bildrate. Die tatsächliche Einstellung wird von der Kamera möglicherweise nicht unterstützt.</translation>
+ </message>
+ <message>
+ <source>Frames per second</source>
+ <translation>Bilder pro Sekunde</translation>
+ </message>
+ <message>
+ <source>MJPEG</source>
+ <translation>MJPEG</translation>
+ </message>
+ <message>
+ <source>Camera settings</source>
+ <translation>Kamera-Einstellungen</translation>
+ </message>
+ <message>
+ <source>Tuning / Debug</source>
+ <translation>Tuning / Fehlersuche</translation>
+ </message>
+ <message>
+ <source>Thread Count</source>
+ <translation>Anzahl der Threads</translation>
+ </message>
+ <message>
+ <source>Number of threads. Can be used to balance the CPU load between the game and the tracker.</source>
+ <translation>Anzahl der Threads. Kann verwendet werden, um die CPU-Last zwischen Spiel und Tracker zu balancieren.</translation>
+ </message>
+ <message>
+ <source>Show the image patch that the pose estimation model sees.</source>
+ <translation>Zeigt den Bildausschnitt, den das Modell zur Posenabschätzung sieht.</translation>
+ </message>
+ <message>
+ <source>Show Network Input</source>
+ <translation>Zeige Netzwerk-Eingabe</translation>
+ </message>
+ <message>
+ <source>ROI Smoothing Alpha</source>
+ <translation>ROI-Glättungsalpha</translation>
+ </message>
+ <message>
+ <source>Amount of smoothing of the face region coordinates. Can help stabilize the pose.</source>
+ <translation>Umfang der Glättung der Gesichtskoordinaten. Kann helfen, die Pose zu stabilisieren.</translation>
+ </message>
+ <message>
+ <source>ROI Zoom</source>
+ <translation>ROI-Zoom</translation>
+ </message>
+ <message>
+ <source>Zoom factor for the face region. Applied before the patch is fed into the pose estimation model. There is a sweet spot near 1.</source>
+ <translation>Zoom-Faktor der Gesichtsregion. Wird angewendet, bevor der Bildausschnitt zum Posen-Abschätzungsmodell gesendet wird. Der Sweet-Spot liegt nahe bei 1.</translation>
+ </message>
+ <message>
+ <source>Select the pose network. Changes take affect on the next tracker start</source>
+ <translation>Wählt das Pose-Netzwerk. Die Änderungen treten beim nächsten Start des Trackers in Kraft</translation>
+ </message>
+ <message>
+ <source>Select Pose Net ONNX</source>
+ <translation>Wähle Pose-Netzwerk ONNX</translation>
+ </message>
+ <message>
+ <source>&lt;the pose net file&gt;</source>
+ <translation>&lt;die pose netzwerk datei&gt;</translation>
+ </message>
+</context>
+<context>
+ <name>neuralnet_tracker_ns::NeuralNetDialog</name>
+ <message>
+ <source>Default</source>
+ <translation>Standard</translation>
+ </message>
+ <message>
+ <source>Tracker Offline</source>
+ <translation>Tracker offline</translation>
+ </message>
+ <message>
+ <source>%1x%2 @ %3 FPS / Inference: %4 ms</source>
+ <translation>%1x%2 @ %3 FPS / Inferenz: %4 ms</translation>
+ </message>
+ <message>
+ <source>%1 yaw samples. Yaw more to %2 samples for stable calibration.</source>
+ <translation>%1 Gieren-Proben. Weiterhin gieren bis %2 Proben für eine stabile Kalibrierung.</translation>
+ </message>
+ <message>
+ <source>%1 pitch samples. Pitch more to %2 samples for stable calibration.</source>
+ <translation>%1 Nicken-Proben. Weiterhin nicken bis %2 Proben für eine stabile Kalibrierung.</translation>
+ </message>
+ <message>
+ <source>%1 samples. Over %2, good!</source>
+ <translation>%1 Proben. Mehr als %2, gut!</translation>
+ </message>
+ <message>
+ <source>Stop calibration</source>
+ <translation>Kalibrierung stoppen</translation>
+ </message>
+ <message>
+ <source>Start calibration</source>
+ <translation>Kalibrierung starten</translation>
+ </message>
+ <message>
+ <source>Select Pose Net ONNX</source>
+ <translation>Wähle Pose-Netzwerk ONNX</translation>
+ </message>
+ <message>
+ <source>ONNX Files (*.onnx)</source>
+ <translation>ONNX-Dateien (*.onnx)</translation>
+ </message>
+</context>
+</TS>
diff --git a/tracker-neuralnet/lang/nl_NL.ts b/tracker-neuralnet/lang/nl_NL.ts
new file mode 100644
index 00000000..27da4f5a
--- /dev/null
+++ b/tracker-neuralnet/lang/nl_NL.ts
@@ -0,0 +1,171 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.1" language="nl_NL">
+<context>
+ <name>Form</name>
+ <message>
+ <source>Tracker settings</source>
+ <translation>Tracker-instellingen</translation>
+ </message>
+ <message>
+ <source>Frames per second</source>
+ <translation>Frames per seconde</translation>
+ </message>
+ <message>
+ <source>Camera name</source>
+ <translation>Cameranaam</translation>
+ </message>
+ <message>
+ <source>Diagonal FOV</source>
+ <translation>Diagonale FOV</translation>
+ </message>
+ <message>
+ <source>Camera settings</source>
+ <translation>Camera-instellingen</translation>
+ </message>
+ <message>
+ <source>Camera Configuration</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Head Center Offset</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source> mm</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Use only yaw and pitch while calibrating.
+Don&apos;t roll or change position.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Start calibration</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Right</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Forward</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Up</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Show Network Input</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>MJPEG</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Tuning / Debug</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>ROI Smoothing Alpha</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>ROI Zoom</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Thread Count</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Resolution</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Field of view. Needed to transform the pose to world coordinates.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Requested video frame rate. Actual setting may not be supported by the camera.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>The requested resolution for cases where the camera delivers maximum frame rate only for a particular resolution. The image may still be downscaled to the internal resolution.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Number of threads. Can be used to balance the CPU load between the game and the tracker.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Show the image patch that the pose estimation model sees.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Amount of smoothing of the face region coordinates. Can help stabilize the pose.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Zoom factor for the face region. Applied before the patch is fed into the pose estimation model. There is a sweet spot near 1.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Select Pose Net ONNX</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>&lt;the pose net file&gt;</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Select the pose network. Changes take affect on the next tracker start</source>
+ <translation type="unfinished"></translation>
+ </message>
+</context>
+<context>
+ <name>neuralnet_tracker_ns::NeuralNetDialog</name>
+ <message>
+ <source>Default</source>
+ <translation type="unfinished">Standaard</translation>
+ </message>
+ <message>
+ <source>Tracker Offline</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>%1x%2 @ %3 FPS / Inference: %4 ms</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>%1 yaw samples. Yaw more to %2 samples for stable calibration.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>%1 pitch samples. Pitch more to %2 samples for stable calibration.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>%1 samples. Over %2, good!</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Stop calibration</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Start calibration</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Select Pose Net ONNX</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>ONNX Files (*.onnx)</source>
+ <translation type="unfinished"></translation>
+ </message>
+</context>
+</TS>
diff --git a/tracker-neuralnet/lang/ru_RU.ts b/tracker-neuralnet/lang/ru_RU.ts
new file mode 100644
index 00000000..c32d4fa7
--- /dev/null
+++ b/tracker-neuralnet/lang/ru_RU.ts
@@ -0,0 +1,174 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.1" language="ru_RU">
+<context>
+ <name>Form</name>
+ <message>
+ <source>Tracker settings</source>
+ <translation>Настройки трекера</translation>
+ </message>
+ <message>
+ <source>Diagonal FOV</source>
+ <translation>Угол обзора</translation>
+ </message>
+ <message>
+ <source>Camera settings</source>
+ <translation>Настройки камеры</translation>
+ </message>
+ <message>
+ <source>Frames per second</source>
+ <translation>Кадры в секунду</translation>
+ </message>
+ <message>
+ <source>Camera name</source>
+ <translation>Камера</translation>
+ </message>
+ <message>
+ <source>Camera Configuration</source>
+ <translation>Конфигурация камеры</translation>
+ </message>
+ <message>
+ <source>Head Center Offset</source>
+ <translation>Смещение центра головы</translation>
+ </message>
+ <message>
+ <source> mm</source>
+ <translation> мм</translation>
+ </message>
+ <message>
+ <source>Use only yaw and pitch while calibrating.
+Don&apos;t roll or change position.</source>
+ <translation>Поворачивайте голову влево-вправо и наклоняйте вверх-вниз.
+Не наклоняйте набок и не смещайте голову в сторону.</translation>
+ </message>
+ <message>
+ <source>Start calibration</source>
+ <translation>Начать калибровку</translation>
+ </message>
+ <message>
+ <source>Right</source>
+ <translation>Вправо</translation>
+ </message>
+ <message>
+ <source>Forward</source>
+ <translation>Вперед</translation>
+ </message>
+ <message>
+ <source>Up</source>
+ <translation>Вверх</translation>
+ </message>
+ <message>
+ <source>Show Network Input</source>
+ <translation>Показать входные данные</translation>
+ </message>
+ <message>
+ <source>MJPEG</source>
+ <translation>Использовать MJPEG</translation>
+ </message>
+ <message>
+ <source>Tuning / Debug</source>
+ <translation>Тонкая настройка</translation>
+ </message>
+ <message>
+ <source>ROI Smoothing Alpha</source>
+ <translation>Сглаживание ROI</translation>
+ </message>
+ <message>
+ <source>ROI Zoom</source>
+ <translation>Масштабирование ROI</translation>
+ </message>
+ <message>
+ <source>Thread Count</source>
+ <translation>Количество потоков</translation>
+ </message>
+ <message>
+ <source>Resolution</source>
+ <translation>Разрешение</translation>
+ </message>
+ <message>
+ <source>Field of view. Needed to transform the pose to world coordinates.</source>
+ <translation>Угол обзора камеры. Требуется для преобразования положения головы в глобальные координаты</translation>
+ </message>
+ <message>
+ <source>Requested video frame rate. Actual setting may not be supported by the camera.</source>
+ <translation>Частота кадров. Реальные значения могут не поддерживаться камерой.</translation>
+ </message>
+ <message>
+ <source>The requested resolution for cases where the camera delivers maximum frame rate only for a particular resolution. The image may still be downscaled to the internal resolution.</source>
+ <translation>Разрешение камеры, для тех случаев, когда быстродействие камеры максимально в определенном разрешении. Может быть масштабировано до внутреннего разрешения.</translation>
+ </message>
+ <message>
+ <source>Number of threads. Can be used to balance the CPU load between the game and the tracker.</source>
+ <translation>Количество потоков. Используется для балансировки нагрузки на процессор между игрой и трекером.</translation>
+ </message>
+ <message>
+ <source>Show the image patch that the pose estimation model sees.</source>
+ <translation>Показать изображение, используемое моделью определения позиции</translation>
+ </message>
+ <message>
+ <source>Amount of smoothing of the face region coordinates. Can help stabilize the pose.</source>
+ <translation>Сглаживание координат области лица. Может помочь стабилизировать позицию.</translation>
+ </message>
+ <message>
+ <source>Zoom factor for the face region. Applied before the patch is fed into the pose estimation model. There is a sweet spot near 1.</source>
+ <translation>Фактор масштабирования области лица. Применяется перед передачей кадра в модель определения позиции. Наилучшие результаты близки к 1</translation>
+ </message>
+ <message>
+ <source>Select Pose Net ONNX</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>&lt;the pose net file&gt;</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+        <source>Select the pose network. Changes take effect on the next tracker start</source>
+ <translation type="unfinished"></translation>
+ </message>
+</context>
+<context>
+ <name>neuralnet_tracker_ns::NeuralNetDialog</name>
+ <message>
+ <source>Default</source>
+ <translation>По умолчанию</translation>
+ </message>
+ <message>
+ <source>Tracker Offline</source>
+ <translation>Трекер выключен</translation>
+ </message>
+ <message>
+ <source>%1x%2 @ %3 FPS / Inference: %4 ms</source>
+ <translation>%1x%2 @ %3 FPS; Время оценки: %4 мс</translation>
+ </message>
+ <message>
+ <source>%1 yaw samples. Yaw more to %2 samples for stable calibration.</source>
+ <translation>Сэмплов поворота: %1.
+Поворачивайте голову в стороны до %2 сэмплов для стабильной калибрации.</translation>
+ </message>
+ <message>
+ <source>%1 pitch samples. Pitch more to %2 samples for stable calibration.</source>
+ <translation>Сэмплов наклона: %1.
+Наклоняйте голову вниз/вверх до %2 сэмплов для стабильной калибрации.</translation>
+ </message>
+ <message>
+ <source>%1 samples. Over %2, good!</source>
+ <translation>%1 сэмплов. Более %2, достаточно.</translation>
+ </message>
+ <message>
+ <source>Stop calibration</source>
+ <translation>Остановить калибровку</translation>
+ </message>
+ <message>
+ <source>Start calibration</source>
+ <translation>Начать калибровку</translation>
+ </message>
+ <message>
+ <source>Select Pose Net ONNX</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>ONNX Files (*.onnx)</source>
+ <translation type="unfinished"></translation>
+ </message>
+</context>
+</TS>
diff --git a/tracker-neuralnet/lang/stub.ts b/tracker-neuralnet/lang/stub.ts
new file mode 100644
index 00000000..9609f05e
--- /dev/null
+++ b/tracker-neuralnet/lang/stub.ts
@@ -0,0 +1,171 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.1">
+<context>
+ <name>Form</name>
+ <message>
+ <source>Tracker settings</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Diagonal FOV</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Camera settings</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Frames per second</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Camera name</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Camera Configuration</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Head Center Offset</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source> mm</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Use only yaw and pitch while calibrating.
+Don&apos;t roll or change position.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Start calibration</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Right</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Forward</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Up</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Show Network Input</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>MJPEG</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Tuning / Debug</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>ROI Smoothing Alpha</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>ROI Zoom</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Thread Count</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Resolution</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Field of view. Needed to transform the pose to world coordinates.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Requested video frame rate. Actual setting may not be supported by the camera.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>The requested resolution for cases where the camera delivers maximum frame rate only for a particular resolution. The image may still be downscaled to the internal resolution.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Number of threads. Can be used to balance the CPU load between the game and the tracker.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Show the image patch that the pose estimation model sees.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Amount of smoothing of the face region coordinates. Can help stabilize the pose.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Zoom factor for the face region. Applied before the patch is fed into the pose estimation model. There is a sweet spot near 1.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Select Pose Net ONNX</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>&lt;the pose net file&gt;</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+        <source>Select the pose network. Changes take effect on the next tracker start</source>
+ <translation type="unfinished"></translation>
+ </message>
+</context>
+<context>
+ <name>neuralnet_tracker_ns::NeuralNetDialog</name>
+ <message>
+ <source>Default</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Tracker Offline</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>%1x%2 @ %3 FPS / Inference: %4 ms</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>%1 yaw samples. Yaw more to %2 samples for stable calibration.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>%1 pitch samples. Pitch more to %2 samples for stable calibration.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>%1 samples. Over %2, good!</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Stop calibration</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Start calibration</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Select Pose Net ONNX</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>ONNX Files (*.onnx)</source>
+ <translation type="unfinished"></translation>
+ </message>
+</context>
+</TS>
diff --git a/tracker-neuralnet/lang/zh_CN.ts b/tracker-neuralnet/lang/zh_CN.ts
new file mode 100644
index 00000000..53da04ae
--- /dev/null
+++ b/tracker-neuralnet/lang/zh_CN.ts
@@ -0,0 +1,172 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.1" language="zh_CN">
+<context>
+ <name>Form</name>
+ <message>
+ <source>Tracker settings</source>
+ <translation>追踪器设置</translation>
+ </message>
+ <message>
+ <source>Diagonal FOV</source>
+ <translation>对角FOV</translation>
+ </message>
+ <message>
+ <source>Camera name</source>
+ <translation>相机名</translation>
+ </message>
+ <message>
+ <source>Frames per second</source>
+ <translation>FPS</translation>
+ </message>
+ <message>
+ <source>Camera settings</source>
+ <translation>相机设置</translation>
+ </message>
+ <message>
+ <source>Camera Configuration</source>
+ <translation>相机配置</translation>
+ </message>
+ <message>
+ <source>Head Center Offset</source>
+ <translation>头部归中补偿</translation>
+ </message>
+ <message>
+ <source> mm</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Use only yaw and pitch while calibrating.
+Don&apos;t roll or change position.</source>
+ <translation>在校准时只使用偏航和俯仰,
+不要滚转或是改变位置. </translation>
+ </message>
+ <message>
+ <source>Start calibration</source>
+ <translation>开始校准</translation>
+ </message>
+ <message>
+ <source>Right</source>
+ <translation>向右</translation>
+ </message>
+ <message>
+ <source>Forward</source>
+ <translation>向前</translation>
+ </message>
+ <message>
+ <source>Up</source>
+ <translation>向上</translation>
+ </message>
+ <message>
+ <source>Show Network Input</source>
+ <translation>展示神经网络输入</translation>
+ </message>
+ <message>
+ <source>MJPEG</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Tuning / Debug</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>ROI Smoothing Alpha</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>ROI Zoom</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Thread Count</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Resolution</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Field of view. Needed to transform the pose to world coordinates.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Requested video frame rate. Actual setting may not be supported by the camera.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>The requested resolution for cases where the camera delivers maximum frame rate only for a particular resolution. The image may still be downscaled to the internal resolution.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Number of threads. Can be used to balance the CPU load between the game and the tracker.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Show the image patch that the pose estimation model sees.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Amount of smoothing of the face region coordinates. Can help stabilize the pose.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Zoom factor for the face region. Applied before the patch is fed into the pose estimation model. There is a sweet spot near 1.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+        <source>Select the pose network. Changes take effect on the next tracker start</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Select Pose Net ONNX</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>&lt;the pose net file&gt;</source>
+ <translation type="unfinished"></translation>
+ </message>
+</context>
+<context>
+ <name>neuralnet_tracker_ns::NeuralNetDialog</name>
+ <message>
+ <source>Default</source>
+ <translation>默认</translation>
+ </message>
+ <message>
+ <source>Tracker Offline</source>
+ <translation>追踪器离线</translation>
+ </message>
+ <message>
+ <source>%1x%2 @ %3 FPS / Inference: %4 ms</source>
+ <translation>%1x%2 @ %3 FPS / 推理: %4 ms</translation>
+ </message>
+ <message>
+ <source>%1 yaw samples. Yaw more to %2 samples for stable calibration.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>%1 pitch samples. Pitch more to %2 samples for stable calibration.</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>%1 samples. Over %2, good!</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>Stop calibration</source>
+ <translation>结束校准</translation>
+ </message>
+ <message>
+ <source>Start calibration</source>
+ <translation>开始校准</translation>
+ </message>
+ <message>
+ <source>Select Pose Net ONNX</source>
+ <translation type="unfinished"></translation>
+ </message>
+ <message>
+ <source>ONNX Files (*.onnx)</source>
+ <translation type="unfinished"></translation>
+ </message>
+</context>
+</TS>
diff --git a/tracker-neuralnet/model_adapters.cpp b/tracker-neuralnet/model_adapters.cpp
new file mode 100644
index 00000000..f53478af
--- /dev/null
+++ b/tracker-neuralnet/model_adapters.cpp
@@ -0,0 +1,416 @@
+#include "model_adapters.h"
+
+#include "compat/timer.hpp"
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/quaternion.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include <QDebug>
+
+namespace neuralnet_tracker_ns
+{
+
+
+float sigmoid(float x)
+{
+ return 1.f/(1.f + std::exp(-x));
+}
+
+
+// Defined in ftnoir_tracker_neuralnet.cpp
+// Normally we wouldn't need it here. However ... see below.
+cv::Quatf image_to_world(cv::Quatf q);
+
+
+cv::Quatf world_to_image(cv::Quatf q)
+{
+ // It's its own inverse.
+ return image_to_world(q);
+}
+
+
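+// Maps a rectangle from normalized [-1,1] coordinates to pixel coordinates:
+// x = -1 maps to column 0 and x = +1 to column w; y behaves likewise with h.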
+cv::Rect2f unnormalize(const cv::Rect2f &r, int h, int w)
+{
+ auto unnorm = [](float x) -> float { return 0.5*(x+1); };
+ auto tl = r.tl();
+ auto br = r.br();
+ auto x0 = unnorm(tl.x)*w;
+ auto y0 = unnorm(tl.y)*h;
+ auto x1 = unnorm(br.x)*w;
+ auto y1 = unnorm(br.y)*h;
+ return {
+ x0, y0, x1-x0, y1-y0
+ };
+}
+
+
+// Returns width and height of the input tensor, or throws.
+// Expects the model to take one tensor as input that must
+// have the shape B x C x H x W, where B=C=1.
+cv::Size get_input_image_shape(const Ort::Session &session)
+{
+ if (session.GetInputCount() < 1)
+ throw std::invalid_argument("Model must take at least one input tensor");
+ const std::vector<std::int64_t> shape =
+ session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
+ if (shape.size() != 4)
+ throw std::invalid_argument("Model takes the input tensor in the wrong shape");
+ return { static_cast<int>(shape[3]), static_cast<int>(shape[2]) };
+}
+
+
+Ort::Value create_tensor(const Ort::TypeInfo& info, Ort::Allocator& alloc)
+{
+ const auto shape = info.GetTensorTypeAndShapeInfo().GetShape();
+ auto t = Ort::Value::CreateTensor<float>(
+ alloc, shape.data(), shape.size());
+ memset(t.GetTensorMutableData<float>(), 0, sizeof(float)*info.GetTensorTypeAndShapeInfo().GetElementCount());
+ return t;
+}
+
+
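+// Computes a 256-bin histogram and walks it until the cumulative pixel count
+// exceeds the requested quantile.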
+int find_input_intensity_quantile(const cv::Mat& frame, float percentage)
+{
+ const int channels[] = { 0 };
+ const int hist_size[] = { 256 };
+ float range[] = { 0, 256 };
+ const float* ranges[] = { range };
+ cv::Mat hist;
+ cv::calcHist(&frame, 1, channels, cv::Mat(), hist, 1, hist_size, ranges, true, false);
+ int gray_level = 0;
+ const int num_pixels_quantile = frame.total()*percentage*0.01f;
+ int num_pixels_accum = 0;
+ for (int i=0; i<hist_size[0]; ++i)
+ {
+ num_pixels_accum += hist.at<float>(i);
+ if (num_pixels_accum > num_pixels_quantile)
+ {
+ gray_level = i;
+ break;
+ }
+ }
+ return gray_level;
+}
+
+
+// Automatic brightness adjustment. Scales the brightness to lie roughly between -0.5 and 0.5.
+void normalize_brightness(const cv::Mat& frame, cv::Mat& out)
+{
+ const float pct = 90;
+
+ const int brightness = find_input_intensity_quantile(frame, pct);
+
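+    // For dark frames (quantile below 127) the gain is chosen so that the pct-quantile
+    // gray level maps to roughly the centre of the output range; brighter frames use a
+    // plain 1/255 scaling into [-0.5, 0.5].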
+ const double alpha = brightness<127 ? (pct/100.f*0.5f/std::max(5,brightness)) : 1./255;
+ const double beta = -0.5;
+
+ frame.convertTo(out, CV_32F, alpha, beta);
+}
+
+
+
+Localizer::Localizer(Ort::MemoryInfo &allocator_info, Ort::Session &&session) :
+ session_{std::move(session)},
+ scaled_frame_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_8U),
+ input_mat_(INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH, CV_32F)
+{
+    // Only works as long as input_mat does not reallocate its memory, which it should not.
+    // Non-owning memory reference to input_mat?
+    // Note: shape = (batch x channels x h x w)
+ const std::int64_t input_shape[4] = { 1, 1, INPUT_IMG_HEIGHT, INPUT_IMG_WIDTH };
+ input_val_ = Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4);
+
+ const std::int64_t output_shape[2] = { 1, 5 };
+ output_val_ = Ort::Value::CreateTensor<float>(allocator_info, results_.data(), results_.size(), output_shape, 2);
+}
+
+
+std::pair<float, cv::Rect2f> Localizer::run(
+ const cv::Mat &frame)
+{
+ auto p = input_mat_.ptr(0);
+
+ cv::resize(frame, scaled_frame_, { INPUT_IMG_WIDTH, INPUT_IMG_HEIGHT }, 0, 0, cv::INTER_AREA);
+ scaled_frame_.convertTo(input_mat_, CV_32F, 1./255., -0.5);
+
+ assert (input_mat_.ptr(0) == p);
+ assert (!input_mat_.empty() && input_mat_.isContinuous());
+ assert (input_mat_.cols == INPUT_IMG_WIDTH && input_mat_.rows == INPUT_IMG_HEIGHT);
+
+ const char* input_names[] = {"x"};
+ const char* output_names[] = {"logit_box"};
+
+ Timer t; t.start();
+
+ session_.Run(Ort::RunOptions{nullptr}, input_names, &input_val_, 1, output_names, &output_val_, 1);
+
+ last_inference_time_ = t.elapsed_ms();
+
+ const cv::Rect2f roi = unnormalize(cv::Rect2f{
+ results_[1],
+ results_[2],
+ results_[3]-results_[1], // Width
+ results_[4]-results_[2] // Height
+ }, frame.rows, frame.cols);
+ const float score = sigmoid(results_[0]);
+
+ return { score, roi };
+}
+
+
+double Localizer::last_inference_time_millis() const
+{
+ return last_inference_time_;
+}
+
+
+std::string PoseEstimator::get_network_input_name(size_t i) const
+{
+#if ORT_API_VERSION >= 12
+ return std::string(&*session_.GetInputNameAllocated(i, allocator_));
+#else
+ return std::string(session_.GetInputName(i, allocator_));
+#endif
+}
+
+std::string PoseEstimator::get_network_output_name(size_t i) const
+{
+#if ORT_API_VERSION >= 12
+ return std::string(&*session_.GetOutputNameAllocated(i, allocator_));
+#else
+ return std::string(session_.GetOutputName(i, allocator_));
+#endif
+}
+
+PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&session)
+ : model_version_{session.GetModelMetadata().GetVersion()}
+ , session_{std::move(session)}
+ , allocator_{session_, allocator_info}
+{
+ using namespace std::literals::string_literals;
+
+ if (session_.GetOutputCount() < 2)
+ throw std::runtime_error("Invalid Model: must have at least two outputs");
+
+    // WARNING: Messy model compatibility issues!
+    // The initial model release did not have the version field set, so reading it here
+    // yields some unspecified value. It's probably UB due to reading uninitialized
+    // memory, but there is little choice.
+    // Detecting that old version is therefore a guess based on the number we get.
+    // An uninitialized value matching a valid version is unlikely. The real problem is
+    // that this check must be updated whenever we want to bump the version number!!
+ if (model_version_ <= 0 || model_version_ > 4)
+ model_version_ = 1;
+
+ const cv::Size input_image_shape = get_input_image_shape(session_);
+
+ scaled_frame_ = cv::Mat(input_image_shape, CV_8U, cv::Scalar(0));
+ input_mat_ = cv::Mat(input_image_shape, CV_32F, cv::Scalar(0.f));
+
+ {
+ const std::int64_t input_shape[4] = { 1, 1, input_image_shape.height, input_image_shape.width };
+ input_val_.push_back(
+ Ort::Value::CreateTensor<float>(allocator_info, input_mat_.ptr<float>(0), input_mat_.total(), input_shape, 4));
+ }
+
+ struct TensorSpec
+ {
+ std::vector<int64_t> shape;
+ float* buffer = nullptr;
+ size_t element_count = 0;
+ bool available = false;
+ };
+
+ std::unordered_map<std::string, TensorSpec> understood_outputs = {
+ { "pos_size", TensorSpec{ { 1, 3 }, &output_coord_[0], output_coord_.rows } },
+ { "quat", TensorSpec{ { 1, 4}, &output_quat_[0], output_quat_.rows } },
+ { "box", TensorSpec{ { 1, 4}, &output_box_[0], output_box_.rows } },
+ { "rotaxis_scales_tril", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }},
+ { "rotaxis_std", TensorSpec{ {1, 3, 3}, output_rotaxis_scales_tril_.val, 9 }}, // TODO: Delete when old models aren't used any more
+ { "pos_size_std", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}},
+ { "pos_size_scales", TensorSpec{ {1, 3}, output_coord_scales_std_.val, output_coord_scales_std_.rows}},
+ { "pos_size_scales_tril", TensorSpec{ {1, 3, 3}, output_coord_scales_tril_.val, 9}}
+ };
+
+ qDebug() << "Pose model inputs (" << session_.GetInputCount() << ")";
+ qDebug() << "Pose model outputs (" << session_.GetOutputCount() << "):";
+ output_names_.resize(session_.GetOutputCount());
+ output_c_names_.resize(session_.GetOutputCount());
+ for (size_t i=0; i<session_.GetOutputCount(); ++i)
+ {
+ const std::string name = get_network_output_name(i);
+ const auto& output_info = session_.GetOutputTypeInfo(i);
+ const auto& onnx_tensor_spec = output_info.GetTensorTypeAndShapeInfo();
+ auto my_tensor_spec_it = understood_outputs.find(name);
+
+ qDebug() << "\t" << name.c_str() << " (" << onnx_tensor_spec.GetShape() << ") dtype: " << onnx_tensor_spec.GetElementType() << " " <<
+ (my_tensor_spec_it != understood_outputs.end() ? "ok" : "unknown");
+
+ if (my_tensor_spec_it != understood_outputs.end())
+ {
+ TensorSpec& t = my_tensor_spec_it->second;
+ if (onnx_tensor_spec.GetShape() != t.shape ||
+ onnx_tensor_spec.GetElementType() != Ort::TypeToTensorType<float>::type)
+ throw std::runtime_error("Invalid output tensor spec for "s + name);
+ output_val_.push_back(Ort::Value::CreateTensor<float>(
+ allocator_info, t.buffer, t.element_count, t.shape.data(), t.shape.size()));
+ t.available = true;
+ }
+ else
+ {
+ // Create tensor regardless and ignore output
+ output_val_.push_back(create_tensor(output_info, allocator_));
+ }
+ output_names_[i] = name;
+ output_c_names_[i] = output_names_[i].c_str();
+ }
+
+ has_uncertainty_ = understood_outputs.at("rotaxis_scales_tril").available ||
+ understood_outputs.at("rotaxis_std").available;
+ has_uncertainty_ &= understood_outputs.at("pos_size_std").available ||
+ understood_outputs.at("pos_size_scales").available ||
+ understood_outputs.at("pos_size_scales_tril").available;
+ pos_scale_uncertainty_is_matrix_ = understood_outputs.at("pos_size_scales_tril").available;
+
+ input_names_.resize(session_.GetInputCount());
+ input_c_names_.resize(session_.GetInputCount());
+ for (size_t i = 0; i < session_.GetInputCount(); ++i)
+ {
+ input_names_[i] = get_network_input_name(i);
+ input_c_names_[i] = input_names_[i].c_str();
+ }
+
+ assert (input_names_.size() == input_val_.size());
+ assert (output_names_.size() == output_val_.size());
+}
+
+
+std::optional<PoseEstimator::Face> PoseEstimator::run(
+ const cv::Mat &frame, const cv::Rect &box)
+{
+ cv::Mat cropped;
+
+ const int patch_size = std::max(box.width, box.height)*1.05;
+ const cv::Point2f patch_center = {
+ std::clamp<float>(box.x + 0.5f*box.width, 0.f, frame.cols),
+ std::clamp<float>(box.y + 0.5f*box.height, 0.f, frame.rows)
+ };
+ cv::getRectSubPix(frame, {patch_size, patch_size}, patch_center, cropped);
+
+    // This can fail if patch_center lies outside the image boundaries.
+    // Have to catch this case.
+ if (cropped.rows != patch_size || cropped.cols != patch_size)
+ return {};
+
+ [[maybe_unused]] auto* p = input_mat_.ptr(0);
+
+ cv::resize(cropped, scaled_frame_, scaled_frame_.size(), 0, 0, cv::INTER_AREA);
+
+ normalize_brightness(scaled_frame_, input_mat_);
+
+ assert (input_mat_.ptr(0) == p);
+ assert (!input_mat_.empty() && input_mat_.isContinuous());
+
+ Timer t; t.start();
+
+ try
+ {
+ session_.Run(
+ Ort::RunOptions{ nullptr },
+ input_c_names_.data(),
+ input_val_.data(),
+ input_val_.size(),
+ output_c_names_.data(),
+ output_val_.data(),
+ output_val_.size());
+ }
+ catch (const Ort::Exception &e)
+ {
+ qDebug() << "Failed to run the model: " << e.what();
+ return {};
+ }
+
+ last_inference_time_ = t.elapsed_ms();
+
+    // Perform the coordinate transformation from patch-local normalized
+    // coordinates in [-1,1] to unnormalized frame pixel coordinates.
+
+ cv::Matx33f center_size_cov_tril = {};
+ if (has_uncertainty_)
+ {
+ if (pos_scale_uncertainty_is_matrix_)
+ {
+ center_size_cov_tril = output_coord_scales_tril_;
+ }
+ else
+ {
+ center_size_cov_tril(0,0) = output_coord_scales_std_[0];
+ center_size_cov_tril(1,1) = output_coord_scales_std_[1];
+ center_size_cov_tril(2,2) = output_coord_scales_std_[2];
+ }
+ center_size_cov_tril *= patch_size*0.5f;
+ }
+
+ const cv::Point2f center = patch_center +
+ (0.5f*patch_size)*cv::Point2f{output_coord_[0], output_coord_[1]};
+ const float size = patch_size*0.5f*output_coord_[2];
+
+    // Following Eigen, which stores the quaternion components in the order w, x, y, z.
+    // OpenCV does the same.
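+    // The network itself emits them as x, y, z, w, hence the reindexing below.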
+ cv::Quatf rotation = {
+ output_quat_[3],
+ output_quat_[0],
+ output_quat_[1],
+ output_quat_[2] };
+
+    // Should be lower triangular. If not, something may be wrong with the memory layout ... or the model.
+ assert(output_rotaxis_scales_tril_(0, 1) == 0);
+ assert(output_rotaxis_scales_tril_(0, 2) == 0);
+ assert(output_rotaxis_scales_tril_(1, 2) == 0);
+ assert(center_size_cov_tril(0, 1) == 0);
+ assert(center_size_cov_tril(0, 2) == 0);
+ assert(center_size_cov_tril(1, 2) == 0);
+
+ cv::Matx33f rotaxis_scales_tril = output_rotaxis_scales_tril_;
+
+ if (model_version_ < 2)
+ {
+ // Due to a change in coordinate conventions
+ rotation = world_to_image(rotation);
+ }
+
+ const cv::Rect2f outbox = {
+ patch_center.x + (0.5f*patch_size)*output_box_[0],
+ patch_center.y + (0.5f*patch_size)*output_box_[1],
+ 0.5f*patch_size*(output_box_[2]-output_box_[0]),
+ 0.5f*patch_size*(output_box_[3]-output_box_[1])
+ };
+
+ return std::optional<Face>({
+ rotation, rotaxis_scales_tril, outbox, center, size, center_size_cov_tril
+ });
+}
+
+
+cv::Mat PoseEstimator::last_network_input() const
+{
+ assert(!input_mat_.empty());
+ cv::Mat ret;
+ input_mat_.convertTo(ret, CV_8U, 255., 127.);
+ cv::cvtColor(ret, ret, cv::COLOR_GRAY2RGB);
+ return ret;
+}
+
+
+double PoseEstimator::last_inference_time_millis() const
+{
+ return last_inference_time_;
+}
+
+
+
+
+
+} // namespace neuralnet_tracker_ns
diff --git a/tracker-neuralnet/model_adapters.h b/tracker-neuralnet/model_adapters.h
new file mode 100644
index 00000000..c1aaa6de
--- /dev/null
+++ b/tracker-neuralnet/model_adapters.h
@@ -0,0 +1,105 @@
+#pragma once
+
+#include <optional>
+#include <array>
+#include <vector>
+#include <string>
+
+#include <onnxruntime_cxx_api.h>
+#include <opencv2/core.hpp>
+#include "opencv_contrib.h"
+
+
+namespace neuralnet_tracker_ns
+{
+
+// Generally useful sigmoid function
+float sigmoid(float x);
+
+
+class Localizer
+{
+ public:
+ Localizer(Ort::MemoryInfo &allocator_info,
+ Ort::Session &&session);
+
+    // Returns the bounding box in image coordinates of the input image.
+    // The preceding float is the score for being a face, normalized to [0,1].
+ std::pair<float, cv::Rect2f> run(
+ const cv::Mat &frame);
+
+ double last_inference_time_millis() const;
+ private:
+ inline static constexpr int INPUT_IMG_WIDTH = 288;
+ inline static constexpr int INPUT_IMG_HEIGHT = 224;
+ Ort::Session session_{nullptr};
+ // Inputs / outputs
+ cv::Mat scaled_frame_{}, input_mat_{};
+ Ort::Value input_val_{nullptr}, output_val_{nullptr};
+ std::array<float, 5> results_;
+ double last_inference_time_ = 0;
+};
+
+
+class PoseEstimator
+{
+ public:
+ struct Face
+ {
+ cv::Quatf rotation;
+ cv::Matx33f rotaxis_cov_tril; // Lower triangular factor of Cholesky decomposition
+ cv::Rect2f box;
+ cv::Point2f center;
+ float size;
+ cv::Matx33f center_size_cov_tril; // Lower triangular factor of Cholesky decomposition
+ };
+
+ PoseEstimator(Ort::MemoryInfo &allocator_info,
+ Ort::Session &&session);
+ /** Inference
+ *
+ * Coordinates are defined wrt. the image space of the input `frame`.
+ * X goes right, Z (depth) into the image, Y points down (like pixel coordinates values increase from top to bottom)
+ */
+ std::optional<Face> run(const cv::Mat &frame, const cv::Rect &box);
+ // Returns an image compatible with the 'frame' image for displaying.
+ cv::Mat last_network_input() const;
+ double last_inference_time_millis() const;
+ bool has_uncertainty() const { return has_uncertainty_; }
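+    // Hedged usage sketch; `env`, `opts`, `model_path`, `gray_frame` and `face_box`
+    // are illustrative names, not part of this module:
+    //   Ort::MemoryInfo mi = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+    //   PoseEstimator pose(mi, Ort::Session(env, model_path, opts));
+    //   if (std::optional<Face> face = pose.run(gray_frame, face_box))
+    //       use_pose(face->rotation, face->center, face->size);  // use_pose is hypothetical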
+
+ private:
+ std::string get_network_input_name(size_t i) const;
+ std::string get_network_output_name(size_t i) const;
+ int64_t model_version_ = 0; // Queried meta data from the ONNX file
+ Ort::Session session_{nullptr}; // ONNX's runtime context for running the model
+ mutable Ort::Allocator allocator_; // Memory allocator for tensors
+ // Inputs
+    cv::Mat scaled_frame_{}, input_mat_{}; // Input. scaled_frame_ is the resized 8-bit crop; input_mat_ its float, brightness-normalized version.
+ std::vector<Ort::Value> input_val_; // Tensors to put into the model
+ std::vector<std::string> input_names_; // Refers to the names in the onnx model.
+ std::vector<const char *> input_c_names_; // Refers to the C names in the onnx model.
+ // Outputs
+ cv::Vec<float, 3> output_coord_{}; // 2d Coordinate and head size output.
+ cv::Vec<float, 4> output_quat_{}; // Quaternion output
+ cv::Vec<float, 4> output_box_{}; // Bounding box output
+ cv::Matx33f output_rotaxis_scales_tril_{}; // Lower triangular matrix of LLT factorization of covariance of rotation vector as offset from output quaternion
+ cv::Matx33f output_coord_scales_tril_{}; // Lower triangular factor
+ cv::Vec3f output_coord_scales_std_{}; // Depending on the model, alternatively a 3d vector with standard deviations.
+ std::vector<Ort::Value> output_val_; // Tensors to put the model outputs in.
+ std::vector<std::string> output_names_; // Refers to the names in the onnx model.
+ std::vector<const char *> output_c_names_; // Refers to the C names in the onnx model.
+ // More bookkeeping
+ double last_inference_time_ = 0;
+ bool has_uncertainty_ = false;
+ bool pos_scale_uncertainty_is_matrix_ = false;
+};
+
+
+// Finds the gray level below which roughly `percentage` percent of the pixels lie.
+int find_input_intensity_quantile(const cv::Mat& frame, float percentage);
+
+// Adjusts the brightness to (roughly) full range and scales the values to [-0.5, 0.5].
+void normalize_brightness(const cv::Mat& frame, cv::Mat& out);
+
+
+} // namespace neuralnet_tracker_ns
diff --git a/tracker-neuralnet/models/head-localizer.onnx b/tracker-neuralnet/models/head-localizer.onnx
new file mode 100644
index 00000000..c128f89d
--- /dev/null
+++ b/tracker-neuralnet/models/head-localizer.onnx
Binary files differ
diff --git a/tracker-neuralnet/models/head-pose-0.2-big.onnx b/tracker-neuralnet/models/head-pose-0.2-big.onnx
new file mode 100644
index 00000000..e53fd831
--- /dev/null
+++ b/tracker-neuralnet/models/head-pose-0.2-big.onnx
Binary files differ
diff --git a/tracker-neuralnet/models/head-pose-0.2-small.onnx b/tracker-neuralnet/models/head-pose-0.2-small.onnx
new file mode 100644
index 00000000..f2b64219
--- /dev/null
+++ b/tracker-neuralnet/models/head-pose-0.2-small.onnx
Binary files differ
diff --git a/tracker-neuralnet/models/head-pose-0.3-big-quantized.onnx b/tracker-neuralnet/models/head-pose-0.3-big-quantized.onnx
new file mode 100644
index 00000000..7f875c63
--- /dev/null
+++ b/tracker-neuralnet/models/head-pose-0.3-big-quantized.onnx
Binary files differ
diff --git a/tracker-neuralnet/neuralnet-tracker.qrc b/tracker-neuralnet/neuralnet-tracker.qrc
new file mode 100644
index 00000000..d30ec313
--- /dev/null
+++ b/tracker-neuralnet/neuralnet-tracker.qrc
@@ -0,0 +1,5 @@
+<RCC>
+ <qresource prefix="/">
+ <file>images/neuralnet.png</file>
+ </qresource>
+</RCC>
diff --git a/tracker-neuralnet/neuralnet-trackercontrols.ui b/tracker-neuralnet/neuralnet-trackercontrols.ui
new file mode 100644
index 00000000..ae2450b4
--- /dev/null
+++ b/tracker-neuralnet/neuralnet-trackercontrols.ui
@@ -0,0 +1,697 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>Form</class>
+ <widget class="QWidget" name="Form">
+ <property name="windowModality">
+ <enum>Qt::NonModal</enum>
+ </property>
+ <property name="geometry">
+ <rect>
+ <x>0</x>
+ <y>0</y>
+ <width>651</width>
+ <height>432</height>
+ </rect>
+ </property>
+ <property name="windowTitle">
+ <string>Tracker settings</string>
+ </property>
+ <layout class="QGridLayout" name="gridLayout">
+ <item row="4" column="0">
+ <widget class="QGroupBox" name="groupBox_10">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="autoFillBackground">
+ <bool>true</bool>
+ </property>
+ <property name="title">
+ <string>Head Center Offset</string>
+ </property>
+ <layout class="QGridLayout" name="gridLayout_5">
+ <item row="0" column="0">
+ <widget class="QFrame" name="frame_4">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="maximumSize">
+ <size>
+ <width>16777215</width>
+ <height>16777215</height>
+ </size>
+ </property>
+ <property name="frameShape">
+ <enum>QFrame::NoFrame</enum>
+ </property>
+ <property name="frameShadow">
+ <enum>QFrame::Raised</enum>
+ </property>
+ <layout class="QGridLayout" name="gridLayout_11">
+ <property name="sizeConstraint">
+ <enum>QLayout::SetDefaultConstraint</enum>
+ </property>
+ <property name="verticalSpacing">
+ <number>0</number>
+ </property>
+ <item row="2" column="0">
+ <widget class="QLabel" name="label_66">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Maximum" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="text">
+ <string>Right</string>
+ </property>
+ </widget>
+ </item>
+ <item row="0" column="0">
+ <widget class="QLabel" name="label_61">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Maximum" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="text">
+ <string>Forward</string>
+ </property>
+ </widget>
+ </item>
+ <item row="1" column="0">
+ <widget class="QLabel" name="label_62">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Maximum" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="text">
+ <string>Up</string>
+ </property>
+ </widget>
+ </item>
+ <item row="0" column="1">
+ <widget class="QSpinBox" name="tx_spin">
+ <property name="maximumSize">
+ <size>
+ <width>150</width>
+ <height>16777215</height>
+ </size>
+ </property>
+ <property name="suffix">
+ <string> mm</string>
+ </property>
+ <property name="minimum">
+ <number>-65535</number>
+ </property>
+ <property name="maximum">
+ <number>65536</number>
+ </property>
+ </widget>
+ </item>
+ <item row="1" column="1">
+ <widget class="QSpinBox" name="ty_spin">
+ <property name="maximumSize">
+ <size>
+ <width>150</width>
+ <height>16777215</height>
+ </size>
+ </property>
+ <property name="suffix">
+ <string> mm</string>
+ </property>
+ <property name="minimum">
+ <number>-65535</number>
+ </property>
+ <property name="maximum">
+ <number>65536</number>
+ </property>
+ </widget>
+ </item>
+ <item row="2" column="1">
+ <widget class="QSpinBox" name="tz_spin">
+ <property name="maximumSize">
+ <size>
+ <width>150</width>
+ <height>16777215</height>
+ </size>
+ </property>
+ <property name="suffix">
+ <string> mm</string>
+ </property>
+ <property name="minimum">
+ <number>-65535</number>
+ </property>
+ <property name="maximum">
+ <number>65536</number>
+ </property>
+ </widget>
+ </item>
+ </layout>
+ </widget>
+ </item>
+ <item row="0" column="1">
+ <widget class="QFrame" name="frame_5">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Preferred" vsizetype="Expanding">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="minimumSize">
+ <size>
+ <width>260</width>
+ <height>0</height>
+ </size>
+ </property>
+ <property name="frameShape">
+ <enum>QFrame::NoFrame</enum>
+ </property>
+ <property name="frameShadow">
+ <enum>QFrame::Raised</enum>
+ </property>
+ <layout class="QVBoxLayout" name="verticalLayout_2">
+ <item>
+ <widget class="QLabel" name="label_59">
+ <property name="text">
+ <string>Use only yaw and pitch while calibrating.
+Don't roll or change position.</string>
+ </property>
+ <property name="alignment">
+ <set>Qt::AlignCenter</set>
+ </property>
+ <property name="wordWrap">
+ <bool>true</bool>
+ </property>
+ <property name="openExternalLinks">
+ <bool>false</bool>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <widget class="QLabel" name="sample_count_display">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Minimum" vsizetype="Maximum">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="frameShape">
+ <enum>QFrame::Panel</enum>
+ </property>
+ <property name="frameShadow">
+ <enum>QFrame::Sunken</enum>
+ </property>
+ <property name="text">
+ <string/>
+ </property>
+ <property name="wordWrap">
+ <bool>true</bool>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <widget class="QPushButton" name="tcalib_button">
+ <property name="enabled">
+ <bool>false</bool>
+ </property>
+ <property name="text">
+ <string>Start calibration</string>
+ </property>
+ <property name="checkable">
+ <bool>true</bool>
+ </property>
+ </widget>
+ </item>
+ </layout>
+ </widget>
+ </item>
+ </layout>
+ </widget>
+ </item>
+ <item row="8" column="0">
+ <widget class="QDialogButtonBox" name="buttonBox">
+ <property name="standardButtons">
+ <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set>
+ </property>
+ </widget>
+ </item>
+ <item row="2" column="0">
+ <widget class="QGroupBox" name="groupBox">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="autoFillBackground">
+ <bool>true</bool>
+ </property>
+ <property name="title">
+ <string>Camera Configuration</string>
+ </property>
+ <property name="flat">
+ <bool>false</bool>
+ </property>
+ <property name="checkable">
+ <bool>false</bool>
+ </property>
+ <layout class="QHBoxLayout" name="horizontalLayout">
+ <property name="spacing">
+ <number>10</number>
+ </property>
+ <property name="bottomMargin">
+ <number>8</number>
+ </property>
+ <item>
+ <layout class="QGridLayout" name="gridLayout_3">
+ <property name="sizeConstraint">
+ <enum>QLayout::SetDefaultConstraint</enum>
+ </property>
+ <property name="leftMargin">
+ <number>0</number>
+ </property>
+ <property name="topMargin">
+ <number>0</number>
+ </property>
+ <property name="rightMargin">
+ <number>0</number>
+ </property>
+ <property name="bottomMargin">
+ <number>0</number>
+ </property>
+ <property name="horizontalSpacing">
+ <number>0</number>
+ </property>
+ <property name="verticalSpacing">
+ <number>2</number>
+ </property>
+ <item row="0" column="1">
+ <widget class="QComboBox" name="cameraName">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ </widget>
+ </item>
+ <item row="1" column="0">
+ <widget class="QLabel" name="label_9">
+ <property name="text">
+ <string>Diagonal FOV</string>
+ </property>
+ </widget>
+ </item>
+ <item row="0" column="0">
+ <widget class="QLabel" name="label_10">
+ <property name="text">
+ <string>Camera name</string>
+ </property>
+ </widget>
+ </item>
+ <item row="1" column="1">
+ <widget class="QSpinBox" name="cameraFOV">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="toolTip">
+ <string>Field of view. Needed to transform the pose to world coordinates.</string>
+ </property>
+ <property name="locale">
+ <locale language="English" country="UnitedStates"/>
+ </property>
+ <property name="minimum">
+ <number>35</number>
+ </property>
+ <property name="maximum">
+ <number>90</number>
+ </property>
+ </widget>
+ </item>
+ <item row="2" column="1">
+ <widget class="QComboBox" name="resolution">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="toolTip">
+ <string>The requested resolution for cases where the camera delivers maximum frame rate only for a particular resolution. The image may still be downscaled to the internal resolution.</string>
+ </property>
+ </widget>
+ </item>
+ <item row="2" column="0">
+ <widget class="QLabel" name="resolution_label">
+ <property name="text">
+ <string>Resolution</string>
+ </property>
+ </widget>
+ </item>
+ </layout>
+ </item>
+ <item>
+ <layout class="QGridLayout" name="gridLayout_6">
+ <property name="leftMargin">
+ <number>0</number>
+ </property>
+ <property name="topMargin">
+ <number>0</number>
+ </property>
+ <property name="rightMargin">
+ <number>0</number>
+ </property>
+ <property name="bottomMargin">
+ <number>0</number>
+ </property>
+ <property name="horizontalSpacing">
+ <number>0</number>
+ </property>
+ <property name="verticalSpacing">
+ <number>2</number>
+ </property>
+ <item row="4" column="1">
+ <widget class="QComboBox" name="cameraFPS">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="toolTip">
+ <string>Requested video frame rate. Actual setting may not be supported by the camera.</string>
+ </property>
+ </widget>
+ </item>
+ <item row="4" column="0">
+ <widget class="QLabel" name="label_12">
+ <property name="text">
+ <string>Frames per second</string>
+ </property>
+ </widget>
+ </item>
+ <item row="2" column="1">
+ <widget class="QCheckBox" name="use_mjpeg">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="minimumSize">
+ <size>
+ <width>0</width>
+ <height>0</height>
+ </size>
+ </property>
+ <property name="text">
+ <string/>
+ </property>
+ </widget>
+ </item>
+ <item row="2" column="0">
+ <widget class="QLabel" name="label_11">
+ <property name="text">
+ <string>MJPEG</string>
+ </property>
+ </widget>
+ </item>
+ <item row="0" column="1">
+ <widget class="QPushButton" name="camera_settings">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="text">
+ <string>Camera settings</string>
+ </property>
+ </widget>
+ </item>
+ </layout>
+ </item>
+ </layout>
+ </widget>
+ </item>
+ <item row="7" column="0">
+ <widget class="QLabel" name="resolution_display">
+ <property name="autoFillBackground">
+ <bool>true</bool>
+ </property>
+ <property name="frameShape">
+ <enum>QFrame::Panel</enum>
+ </property>
+ <property name="frameShadow">
+ <enum>QFrame::Sunken</enum>
+ </property>
+ <property name="text">
+ <string notr="true"/>
+ </property>
+ </widget>
+ </item>
+ <item row="5" column="0">
+ <widget class="QGroupBox" name="tuningOptionsBox">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Preferred" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="minimumSize">
+ <size>
+ <width>0</width>
+ <height>0</height>
+ </size>
+ </property>
+ <property name="autoFillBackground">
+ <bool>true</bool>
+ </property>
+ <property name="title">
+ <string>Tuning / Debug</string>
+ </property>
+ <layout class="QVBoxLayout" name="verticalLayout">
+ <item>
+ <layout class="QHBoxLayout" name="horizontalLayout_3">
+ <item>
+ <widget class="QLabel" name="threadCountLabel">
+ <property name="text">
+ <string>Thread Count</string>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <widget class="QSpinBox" name="threadCount">
+ <property name="toolTip">
+ <string>Number of threads. Can be used to balance the CPU load between the game and the tracker.</string>
+ </property>
+ <property name="minimum">
+ <number>1</number>
+ </property>
+ <property name="maximum">
+ <number>32</number>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <widget class="Line" name="line">
+ <property name="orientation">
+ <enum>Qt::Vertical</enum>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <widget class="QCheckBox" name="showNetworkInput">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Minimum" vsizetype="Fixed">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="toolTip">
+ <string>Show the image patch that the pose estimation model sees.</string>
+ </property>
+ <property name="text">
+ <string>Show Network Input</string>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <widget class="Line" name="line_2">
+ <property name="orientation">
+ <enum>Qt::Vertical</enum>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <widget class="QLabel" name="roiFilterAlphaLabel">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Minimum" vsizetype="Minimum">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="text">
+ <string>ROI Smoothing Alpha</string>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <widget class="QDoubleSpinBox" name="roiFilterAlpha">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Minimum" vsizetype="Fixed">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="maximumSize">
+ <size>
+ <width>150</width>
+ <height>16777215</height>
+ </size>
+ </property>
+ <property name="toolTip">
+ <string>Amount of smoothing of the face region coordinates. Can help stabilize the pose.</string>
+ </property>
+ <property name="wrapping">
+ <bool>false</bool>
+ </property>
+ <property name="decimals">
+ <number>2</number>
+ </property>
+ <property name="maximum">
+ <double>1.000000000000000</double>
+ </property>
+ <property name="singleStep">
+ <double>0.010000000000000</double>
+ </property>
+ <property name="value">
+ <double>1.000000000000000</double>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <widget class="QLabel" name="roiZoomLabel">
+ <property name="text">
+ <string>ROI Zoom</string>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <widget class="QDoubleSpinBox" name="roiZoom">
+ <property name="toolTip">
+ <string>Zoom factor for the face region. Applied before the patch is fed into the pose estimation model. There is a sweet spot near 1.</string>
+ </property>
+ <property name="minimum">
+ <double>0.100000000000000</double>
+ </property>
+ <property name="maximum">
+ <double>2.000000000000000</double>
+ </property>
+ <property name="singleStep">
+ <double>0.010000000000000</double>
+ </property>
+ <property name="value">
+ <double>1.000000000000000</double>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <spacer name="horizontalSpacer">
+ <property name="orientation">
+ <enum>Qt::Horizontal</enum>
+ </property>
+ <property name="sizeHint" stdset="0">
+ <size>
+ <width>40</width>
+ <height>20</height>
+ </size>
+ </property>
+ </spacer>
+ </item>
+ </layout>
+ </item>
+ <item>
+ <widget class="QFrame" name="network_select_frame">
+ <property name="frameShape">
+ <enum>QFrame::NoFrame</enum>
+ </property>
+ <property name="frameShadow">
+ <enum>QFrame::Raised</enum>
+ </property>
+ <layout class="QHBoxLayout" name="horizontalLayout_2">
+ <property name="leftMargin">
+ <number>0</number>
+ </property>
+ <property name="topMargin">
+ <number>0</number>
+ </property>
+ <property name="rightMargin">
+ <number>0</number>
+ </property>
+ <property name="bottomMargin">
+ <number>0</number>
+ </property>
+ <item>
+ <widget class="QPushButton" name="posenetSelectButton">
+ <property name="toolTip">
+            <string>Select the pose network. Changes take effect on the next tracker start</string>
+ </property>
+ <property name="text">
+ <string>Select Pose Net ONNX</string>
+ </property>
+ </widget>
+ </item>
+ <item>
+ <widget class="QLabel" name="posenetFileDisplay">
+ <property name="sizePolicy">
+ <sizepolicy hsizetype="Expanding" vsizetype="Preferred">
+ <horstretch>0</horstretch>
+ <verstretch>0</verstretch>
+ </sizepolicy>
+ </property>
+ <property name="text">
+ <string>&lt;the pose net file&gt;</string>
+ </property>
+ </widget>
+ </item>
+ </layout>
+ </widget>
+ </item>
+ </layout>
+ </widget>
+ </item>
+ </layout>
+ </widget>
+ <resources/>
+ <connections/>
+ <designerdata>
+ <property name="gridDeltaX">
+ <number>10</number>
+ </property>
+ <property name="gridDeltaY">
+ <number>10</number>
+ </property>
+ <property name="gridSnapX">
+ <bool>false</bool>
+ </property>
+ <property name="gridSnapY">
+ <bool>false</bool>
+ </property>
+ <property name="gridVisible">
+ <bool>true</bool>
+ </property>
+ </designerdata>
+</ui>
diff --git a/tracker-neuralnet/opencv_contrib.h b/tracker-neuralnet/opencv_contrib.h
new file mode 100644
index 00000000..1c199025
--- /dev/null
+++ b/tracker-neuralnet/opencv_contrib.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/base.hpp>
+#include <opencv2/core/quaternion.hpp>
+
+// Eventually this might become a contribution to OpenCV.
+
+namespace cvcontrib
+{
+
+
+template<class T>
+cv::Point_<T> as_point(const cv::Size_<T>& s)
+{
+ return { s.width, s.height };
+}
+
+
+template<class T>
+cv::Size_<T> as_size(const cv::Point_<T>& p)
+{
+ return { p.x, p.y };
+}
+
+
+template<int n, int m>
+inline bool allfinite(const cv::Matx<float, n, m> &mat)
+{
+ const size_t sz = mat.rows*mat.cols;
+ for (size_t i=0; i<sz; ++i)
+ if (!std::isfinite(mat.val[i]))
+ return false;
+ return true;
+}
+
+
+// Because the compiler refuses to convert it automatically
+template<int n>
+inline cv::Vec<float, n> to_vec(const cv::Matx<float, n, 1>& m)
+{
+ return cv::Vec<float,n>{m.val};
+}
+
+
+template<int n, int m, int o>
+inline void set_minor(cv::Vec<float, m> &dst, const int startrow, const cv::Matx<float, o, 1> &src)
+{
+ assert (startrow>=0 && startrow+n <= dst.rows);
+ for (int row=startrow, i=0; row<startrow+n; ++row,++i)
+ {
+ dst[row] = src(i,0);
+ }
+}
+
+
+template<int nrows, int ncols, int m, int n>
+inline void set_minor(cv::Matx<float, m, n>& dst, const int startrow, int startcol, const cv::Matx<float, nrows, ncols> &src)
+{
+ assert (startrow>=0 && startrow+nrows <= dst.rows);
+ assert (startcol>=0 && startcol+ncols <= dst.cols);
+ for (int row=startrow, i=0; row<startrow+nrows; ++row,++i)
+ {
+ for (int col=startcol, j=0; col<startcol+ncols; ++col,++j)
+ {
+ dst(row, col) = src(i,j);
+ }
+ }
+}
+
+
+inline cv::Quatf identity_quat()
+{
+ return cv::Quatf(1,0,0,0);
+}
+
+
+inline cv::Vec3f toRotVec(const cv::Quatf& q)
+{
+ // This is an improved implementation
+#if 1
+ // w = cos(alpha/2)
+ // xyz = sin(alpha/2)*axis
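+    // Hence angle = 2*atan2(|xyz|, w); the eps term keeps the identity quaternion
+    // (|xyz| == 0) finite and maps it to the zero rotation vector.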
+ static constexpr float eps = 1.e-12f;
+ const cv::Vec3f xyz{q.x, q.y, q.z};
+ const float len = cv::norm(xyz);
+ const float angle = std::atan2(len, q.w)*2.f;
+ return xyz*(angle/(len+eps));
+#else
+ // The opencv implementation fails even the simplest test:
+ // out = toRVec(cv::Quatf{1., 0., 0., 0. });
+ // ASSERT_TRUE(std::isfinite(out[0]) && std::isfinite(out[1]) && std::isfinite(out[2]));
+ return q.toRotVec();
+#endif
+}
+
+
+inline cv::Vec3f rotate(const cv::Quatf& q, const cv::Vec3f &v)
+{
+ const auto r = q * cv::Quatf{0., v[0], v[1], v[2]} * q.conjugate();
+ return { r.x, r.y, r.z };
+}
+
+
+template<int n>
+inline cv::Matx<float, n, n> cholesky(const cv::Matx<float, n, n>& mat)
+{
+ cv::Matx<float, n, n> l = mat;
+    // The code is the documentation!
+ // https://github.com/opencv/opencv/blob/4.5.4/modules/core/src/matrix_decomp.cpp#L95
+ cv::Cholesky(l.val, l.cols * sizeof(float), n, nullptr, 0, 0);
+    // It doesn't clear the upper triangle, so we do that ourselves.
+ for (int row=0; row<n; ++row)
+ for (int col=row+1; col<n; ++col)
+ l(row, col) = 0.f;
+ return l;
+}
+
+
+} // namespace cvcontrib \ No newline at end of file
diff --git a/tracker-neuralnet/preview.cpp b/tracker-neuralnet/preview.cpp
new file mode 100644
index 00000000..76a6bbc0
--- /dev/null
+++ b/tracker-neuralnet/preview.cpp
@@ -0,0 +1,135 @@
+#include "preview.h"
+
+
+namespace neuralnet_tracker_ns
+{
+
+
+cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h)
+{
+ auto [w, h] = size;
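+    // Compare w/h against aspect_w/aspect_h via cross-multiplication to stay in integer arithmetic.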
+ if ( w*aspect_h > aspect_w*h )
+ {
+ // Image is too wide
+ const int new_w = (aspect_w*h)/aspect_h;
+ return cv::Rect((w - new_w)/2, 0, new_w, h);
+ }
+ else
+ {
+ const int new_h = (aspect_h*w)/aspect_w;
+ return cv::Rect(0, (h - new_h)/2, w, new_h);
+ }
+}
+
+
+
+
+void Preview::init(const cv_video_widget& widget)
+{
+ auto [w,h] = widget.preview_size();
+ preview_size_ = { w, h };
+}
+
+
+void Preview::copy_video_frame(const cv::Mat& frame)
+{
+ cv::Rect roi = make_crop_rect_for_aspect(frame.size(), preview_size_.width, preview_size_.height);
+
+ cv::resize(frame(roi), preview_image_, preview_size_, 0, 0, cv::INTER_NEAREST);
+
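+    // offset_ and scale_ record how transform() maps camera-frame coordinates into the
+    // cropped and resized preview image.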
+ offset_ = { (float)-roi.x, (float)-roi.y };
+ scale_ = float(preview_image_.cols) / float(roi.width);
+}
+
+
+void Preview::draw_gizmos(
+ const std::optional<PoseEstimator::Face> &face,
+ const std::optional<cv::Rect2f>& last_roi,
+ const std::optional<cv::Rect2f>& last_localizer_roi,
+ const cv::Point2f& neckjoint_position)
+{
+ if (preview_image_.empty())
+ return;
+
+ if (last_roi)
+ {
+ const int col = 255;
+ cv::rectangle(preview_image_, transform(*last_roi), cv::Scalar(0, col, 0), /*thickness=*/1);
+ }
+ if (last_localizer_roi)
+ {
+ const int col = 255;
+ cv::rectangle(preview_image_, transform(*last_localizer_roi), cv::Scalar(col, 0, 255-col), /*thickness=*/1);
+ }
+
+ if (face)
+ {
+ if (face->size>=1.f)
+ cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), int(transform(face->size)), cv::Scalar(255,255,255), 2);
+ cv::circle(preview_image_, static_cast<cv::Point>(transform(face->center)), 3, cv::Scalar(255,255,255), -1);
+
+ const cv::Matx33f R = face->rotation.toRotMat3x3(cv::QUAT_ASSUME_UNIT);
+
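+        // Draw the head's local coordinate axes. Colors are BGR: x = red, y = green, z = blue.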
+ auto draw_coord_line = [&](int i, const cv::Scalar& color)
+ {
+ const float vx = R(0,i);
+ const float vy = R(1,i);
+ static constexpr float len = 100.f;
+ cv::Point q = face->center + len*cv::Point2f{vx, vy};
+ cv::line(preview_image_, static_cast<cv::Point>(transform(face->center)), static_cast<cv::Point>(transform(q)), color, 2);
+ };
+ draw_coord_line(0, {0, 0, 255});
+ draw_coord_line(1, {0, 255, 0});
+ draw_coord_line(2, {255, 0, 0});
+
+ // Draw the computed joint position
+ auto xy = transform(neckjoint_position);
+ cv::circle(preview_image_, cv::Point(xy.x,xy.y), 5, cv::Scalar(0,0,255), -1);
+ }
+
+
+}
+
+void Preview::overlay_netinput(const cv::Mat& netinput)
+{
+ if (netinput.empty())
+ return;
+
+ const int w = std::min(netinput.cols, preview_image_.cols);
+ const int h = std::min(netinput.rows, preview_image_.rows);
+ cv::Rect roi(0, 0, w, h);
+ netinput(roi).copyTo(preview_image_(roi));
+}
+
+void Preview::draw_fps(double fps, double last_inference_time)
+{
+ char buf[128];
+ ::snprintf(buf, sizeof(buf), "%d Hz, pose inference: %d ms", std::clamp(int(fps), 0, 9999), int(last_inference_time));
+ cv::putText(preview_image_, buf, cv::Point(10, preview_image_.rows-10), cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar(0, 255, 0), 1);
+}
+
+
+void Preview::copy_to_widget(cv_video_widget& widget)
+{
+ if (preview_image_.rows > 0)
+ widget.update_image(preview_image_);
+}
+
+
+cv::Rect2f Preview::transform(const cv::Rect2f& r) const
+{
+ return { (r.x - offset_.x)*scale_, (r.y - offset_.y)*scale_, r.width*scale_, r.height*scale_ };
+}
+
+cv::Point2f Preview::transform(const cv::Point2f& p) const
+{
+ return { (p.x - offset_.x)*scale_ , (p.y - offset_.y)*scale_ };
+}
+
+float Preview::transform(float s) const
+{
+ return s * scale_;
+}
+
+
+} \ No newline at end of file
diff --git a/tracker-neuralnet/preview.h b/tracker-neuralnet/preview.h
new file mode 100644
index 00000000..adc12993
--- /dev/null
+++ b/tracker-neuralnet/preview.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2021 Michael Welter <michael@welter-4d.de>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ */
+
+#pragma once
+
+#include "model_adapters.h"
+
+#include "cv/video-widget.hpp"
+
+#include <optional>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+
+
+namespace neuralnet_tracker_ns
+{
+
+/** Makes a maximum-size cropping rect with the given aspect ratio.
+* @param aspect_w: numerator of the aspect ratio
+* @param aspect_h: denominator of the aspect ratio
+*/
+cv::Rect make_crop_rect_for_aspect(const cv::Size &size, int aspect_w, int aspect_h);
+
+
+/** This class is responsible for drawing the debug/info gizmos.
+*
+* In addition there are functions to transform the inputs to the size of
+* the preview image, which can differ from the size of the camera frame.
+*/
+class Preview
+{
+public:
+ void init(const cv_video_widget& widget);
+ void copy_video_frame(const cv::Mat& frame);
+ void draw_gizmos(
+ const std::optional<PoseEstimator::Face> &face,
+ const std::optional<cv::Rect2f>& last_roi,
+ const std::optional<cv::Rect2f>& last_localizer_roi,
+ const cv::Point2f& neckjoint_position);
+ void overlay_netinput(const cv::Mat& netinput);
+ void draw_fps(double fps, double last_inference_time);
+ void copy_to_widget(cv_video_widget& widget);
+private:
+ // Transform from camera image to preview
+ cv::Rect2f transform(const cv::Rect2f& r) const;
+ cv::Point2f transform(const cv::Point2f& p) const;
+ float transform(float s) const;
+
+ cv::Mat preview_image_;
+ cv::Size preview_size_ = { 0, 0 };
+ float scale_ = 1.f;
+ cv::Point2f offset_ = { 0.f, 0.f};
+};
+
+} // neuralnet_tracker_ns \ No newline at end of file
diff --git a/tracker-neuralnet/redist/vcomp140.dll b/tracker-neuralnet/redist/vcomp140.dll
new file mode 100644
index 00000000..42c069b9
--- /dev/null
+++ b/tracker-neuralnet/redist/vcomp140.dll
Binary files differ
diff --git a/tracker-neuralnet/tests.cpp b/tracker-neuralnet/tests.cpp
new file mode 100644
index 00000000..b1d2a6d0
--- /dev/null
+++ b/tracker-neuralnet/tests.cpp
@@ -0,0 +1,58 @@
+#include "model_adapters.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <numeric>
+#include <string>
+
+namespace neuralnet_tracker_tests
+{
+
+
+void assert_(bool ok, const std::string& msg)
+{
+ if (ok)
+ return;
+    std::fprintf(stderr, "%s\n", msg.c_str());
+ std::exit(-1);
+}
+
+
+void test_find_input_intensity_quantile()
+{
+ cv::Mat data(10,10, CV_8UC1);
+ std::iota(data.begin<uint8_t>(), data.end<uint8_t>(), 0);
+
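+    // The 10x10 image holds each value 0..99 exactly once, so the pct-quantile is simply pct.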
+ const float pct = 90;
+
+ const int val = neuralnet_tracker_ns::find_input_intensity_quantile(data, pct);
+
+ assert_(val == int(10*10*pct/100.f), "test_find_input_intensity_quantile failed");
+}
+
+
+void test_normalize_brightness()
+{
+ cv::Mat data(10,10, CV_8UC1);
+ std::iota(data.begin<uint8_t>(), data.end<uint8_t>(), 0);
+
+ cv::Mat out;
+ neuralnet_tracker_ns::normalize_brightness(data, out);
+
+ auto [minit,maxit] = std::minmax_element(out.begin<float>(),out.end<float>());
+ const auto minval = *minit;
+ const auto maxval = *maxit;
+ assert_(std::abs(minval + 0.5f) < 0.02, "test_normalize_brightness failed");
+ // If the brightest value is lower than half-max, it will be boosted to half-max.
+    // Otherwise it will just be rescaled to [-0.5, 0.5]. Here we have the low-brightness case.
+ assert_(std::abs(maxval - 0.0f) < 0.02, "test_normalize_brightness failed");
+}
+
+
+void run()
+{
+ test_find_input_intensity_quantile();
+ test_normalize_brightness();
+}
+
+
+} \ No newline at end of file
diff --git a/tracker-neuralnet/unscented_trafo.h b/tracker-neuralnet/unscented_trafo.h
new file mode 100644
index 00000000..267aa969
--- /dev/null
+++ b/tracker-neuralnet/unscented_trafo.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <algorithm>
+#include <opencv2/core.hpp>
+#include <opencv2/core/base.hpp>
+#include <opencv2/core/quaternion.hpp>
+
+#include <cmath>
+#include <array>
+#include <tuple>
+#include <vector>
+
+#include "opencv_contrib.h"
+
+namespace ukf_cv
+{
+
+using namespace cvcontrib;
+
+template<int dim, int otherdim = dim>
+using SigmaPoints = std::array<cv::Vec<float,otherdim>,dim*2+1>;
+
+
+// Ported from
+// https://filterpy.readthedocs.io/en/latest/_modules/filterpy/kalman/sigma_points.html
+// Excerpt from the original documentation:
+// "
+
+// Generates sigma points and weights according to Van der Merwe's
+// 2004 dissertation[1] for the UnscentedKalmanFilter class. It
+// parameterizes the sigma points using alpha, beta, kappa terms, and
+// is the version seen in most publications.
+
+// Unless you know better, this should be your default choice.
+
+// alpha : float
+//     Determines the spread of the sigma points around the mean.
+// Usually a small positive value (1e-3) according to [3].
+
+// beta : float
+// Incorporates prior knowledge of the distribution of the mean. For
+// Gaussian x beta=2 is optimal, according to [3].
+
+// kappa : float, default=0.0
+// Secondary scaling parameter usually set to 0 according to [4],
+// or to 3-n according to [5].
+
+// Reference
+// .. [1] R. Van der Merwe "Sigma-Point Kalman Filters for Probabilistic
+// Inference in Dynamic State-Space Models" (Doctoral dissertation)
+
+// "
+template<int dim>
+class MerweScaledSigmaPoints
+{
+public:
+ static constexpr int num_sigmas = 2*dim+1;
+
+ using Vector = cv::Vec<float,dim>;
+ using Matrix = cv::Matx<float,dim,dim>;
+
+ MerweScaledSigmaPoints(float alpha = 0.01, float beta = 2., int kappa = 3-dim)
+ {
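+        // Van der Merwe weights: lambda = alpha^2*(dim+kappa) - dim,
+        // Wm_0 = lambda/(dim+lambda), Wc_0 = Wm_0 + (1 - alpha^2 + beta),
+        // and all remaining mean/covariance weights equal 1/(2*(dim+lambda)).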
+ lambda = alpha*alpha * (dim + kappa) - dim;
+ const float c = .5 / (dim + lambda);
+ Wc_i = c;
+ Wm_i = c;
+ Wm_0 = lambda / (dim+lambda);
+ Wc_0 = Wm_0 + (1.-alpha*alpha + beta);
+ }
+
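+    // Returns the 2*dim+1 sigma points: the mean itself, plus the mean shifted by
+    // +/- sqrt(dim+lambda) times each row of an upper-triangular square root of `mat`.
+    // If is_tril_factor is true, `mat` is already a lower-triangular Cholesky factor.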
+ SigmaPoints<dim> compute_sigmas(const Vector &mu, const Matrix &mat, bool is_tril_factor) const
+ {
+ const Matrix triu_factor = is_tril_factor ? mat.t() : cholesky(mat).t();
+
+ const Matrix U = triu_factor*std::sqrt(lambda+dim);
+
+ SigmaPoints<dim> sigmas;
+
+ sigmas[0] = mu;
+ for (int k=0; k<dim; ++k)
+ {
+ sigmas[k+1] = to_vec(mu + U.row(k).t());
+ sigmas[dim+k+1] = to_vec(mu - U.row(k).t());
+ }
+ return sigmas;
+ }
+
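+    // Computes the weighted mean and covariance of (possibly transformed) sigma points.
+    // `otherdim` may differ from `dim` when the points were propagated through a measurement function.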
+ template<int otherdim>
+ std::tuple<cv::Vec<float,otherdim> , cv::Matx<float,otherdim,otherdim>> compute_statistics(const SigmaPoints<dim,otherdim> &sigmas) const
+ {
+ cv::Vec<float,otherdim> mu{}; // Zero initializes
+ for (size_t i=0; i<sigmas.size(); ++i)
+ {
+ mu += to_vec((i==0 ? Wm_0 : Wm_i) * sigmas[i]);
+ }
+
+ cv::Matx<float,otherdim,otherdim> cov{};
+ for (size_t i=0; i<sigmas.size(); ++i)
+ {
+ const auto p = sigmas[i] - mu;
+ cov += (i==0 ? Wc_0 : Wc_i)*p*p.t();
+ }
+
+ return { mu, cov };
+ }
+
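+    // Computes the weighted cross-covariance between the state sigma points and their
+    // transformed counterparts, as needed e.g. for the Kalman gain.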
+ template<int otherdim>
+ cv::Matx<float,dim,otherdim> compute_cov(const SigmaPoints<dim,dim> &sigmas, const SigmaPoints<dim,otherdim> &othersigmas) const
+ {
+ cv::Vec<float,dim> mu{}; // Zero initializes
+ cv::Vec<float,otherdim> mu_other{}; // Zero initializes
+ for (size_t i=0; i<sigmas.size(); ++i)
+ {
+ mu += to_vec((i==0 ? Wm_0 : Wm_i) * sigmas[i]);
+ mu_other += to_vec((i==0 ? Wm_0 : Wm_i) * othersigmas[i]);
+ }
+
+ cv::Matx<float,dim,otherdim> cov{};
+ for (size_t i=0; i<sigmas.size(); ++i)
+ {
+ const auto p = sigmas[i] - mu;
+ const auto q = othersigmas[i] - mu_other;
+ cov += (i==0 ? Wc_0 : Wc_i)*p*q.t();
+ }
+
+ return cov;
+ }
+private:
+ float Wc_i, Wm_i, Wm_0, Wc_0, lambda;
+};
+
+
+} // namespace ukf_cv \ No newline at end of file