Diffstat (limited to 'tracker-neuralnet/ftnoir_tracker_neuralnet.cpp')
-rw-r--r-- tracker-neuralnet/ftnoir_tracker_neuralnet.cpp | 272
1 file changed, 212 insertions(+), 60 deletions(-)
diff --git a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
index 06874e6c..00f3f281 100644
--- a/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
+++ b/tracker-neuralnet/ftnoir_tracker_neuralnet.cpp
@@ -16,6 +16,7 @@
#include <opencv2/imgcodecs.hpp>
#include "compat/timer.hpp"
#include <omp.h>
+#include <stdexcept>
#ifdef _MSC_VER
# pragma warning(disable : 4702)
@@ -29,6 +30,7 @@
#include <cmath>
#include <algorithm>
#include <chrono>
+#include <string>
// Some demo code for onnx
// https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp
@@ -43,9 +45,6 @@ using numeric_types::vec2;
using numeric_types::mat33;
using quat = std::array<numeric_types::f,4>;
-// Minimal difference if at all going from 1 to 2 threads.
-static constexpr int num_threads = 1;
-
#if _MSC_VER
std::wstring convert(const QString &s) { return s.toStdWString(); }
@@ -69,6 +68,41 @@ cv::Rect_<T> squarize(const cv::Rect_<T> &r)
}
+template<class T>
+cv::Point_<T> as_point(const cv::Size_<T>& s)
+{
+ return { s.width, s.height };
+}
+
+
+template<class T>
+cv::Size_<T> as_size(const cv::Point_<T>& p)
+{
+ return { p.x, p.y };
+}
+
+
+template<class T>
+cv::Rect_<T> expand(const cv::Rect_<T>& r, T factor)
+{
+ // new_l = l + .5*w - .5*w*f = l + .5*(w - new_w), i.e. the box is scaled about its centre
+ const cv::Size_<T> new_size = { r.width * factor, r.height * factor };
+ const cv::Point_<T> new_tl = r.tl() + (as_point(r.size()) - as_point(new_size)) / T(2);
+ return cv::Rect_<T>(new_tl, new_size);
+}
+
+
+template<class T>
+cv::Rect_<T> ewa_filter(const cv::Rect_<T>& last, const cv::Rect_<T>& current, T alpha)
+{
+ const auto last_center = T(0.5) * (last.tl() + last.br());
+ const auto cur_center = T(0.5) * (current.tl() + current.br());
+ const cv::Point_<T> new_size = as_point(last.size()) + alpha * (as_point(current.size()) - as_point(last.size()));
+ const cv::Point_<T> new_center = last_center + alpha * (cur_center - last_center);
+ return cv::Rect_<T>(new_center - T(0.5) * new_size, as_size(new_size));
+}
+
+
cv::Rect2f unnormalize(const cv::Rect2f &r, int h, int w)
{
auto unnorm = [](float x) -> float { return 0.5*(x+1); };
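The expand() and ewa_filter() templates added above are the building blocks of the new ROI smoothing: expand() scales a rectangle about its centre, ewa_filter() blends the previous and the current box with an exponentially weighted average. A minimal usage sketch, assuming the two templates are in scope (the numbers and the main() harness are illustrative, not part of the commit):

#include <opencv2/core.hpp>
#include <cstdio>

int main()
{
    const cv::Rect2f last   {100.f, 100.f, 50.f, 50.f};   // ROI from the previous frame
    const cv::Rect2f current{110.f, 104.f, 60.f, 60.f};   // ROI estimated on the current frame

    const cv::Rect2f grown    = expand(current, 1.2f);         // 20% larger, same centre
    const cv::Rect2f filtered = ewa_filter(last, grown, 0.5f); // halfway between last and grown

    std::printf("%.1f %.1f %.1f %.1f\n", filtered.x, filtered.y, filtered.width, filtered.height);
    return 0;
}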
@@ -138,6 +172,30 @@ mat33 quaternion_to_mat33(const std::array<float,4> quat)
}
+vec3 rotate_vec(const quat& q, const vec3& p)
+{
+ const float qw = q[0];
+ const float qi = q[1];
+ const float qj = q[2];
+ const float qk = q[3];
+ const float pi = p[0];
+ const float pj = p[1];
+ const float pk = p[2];
+ const quat tmp{
+ - qi*pi - qj*pj - qk*pk,
+ qw*pi + qj*pk - qk*pj,
+ qw*pj - qi*pk + qk*pi,
+ qw*pk + qi*pj - qj*pi
+ };
+ const vec3 out {
+ -tmp[0]*qi + tmp[1]*qw - tmp[2]*qk + tmp[3]*qj,
+ -tmp[0]*qj + tmp[1]*qk + tmp[2]*qw - tmp[3]*qi,
+ -tmp[0]*qk - tmp[1]*qj + tmp[2]*qi + tmp[3]*qw
+ };
+ return out;
+}
+
+
vec3 image_to_world(float x, float y, float size, float reference_size_in_mm, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
{
// Compute the location the network outputs in 3d space.
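rotate_vec() above is the usual quaternion sandwich product p' = q*(0,p)*q^-1 written out with q = (w, i, j, k): tmp holds the first Hamilton product, the final lines fold in the multiplication by the conjugate. A quick sanity check, assuming the quat/vec3 aliases and rotate_vec() are in scope (values illustrative, needs <cmath>):

// Rotating the x axis by 90 degrees about z must yield the y axis.
const float c = std::sqrt(0.5f);                     // cos(45 deg) == sin(45 deg)
const quat q {c, 0.f, 0.f, c};                       // (w, i, j, k): 90 deg rotation about z
const vec3 p = rotate_vec(q, vec3{1.f, 0.f, 0.f});   // p is approximately {0, 1, 0}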
@@ -186,8 +244,8 @@ T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b)
// have the shape B x C x H x W, where B=C=1.
cv::Size get_input_image_shape(const Ort::Session &session)
{
- if (session.GetInputCount() != 1)
- throw std::invalid_argument("Model must take exactly one input tensor");
+ if (session.GetInputCount() < 1)
+ throw std::invalid_argument("Model must take at least one input tensor");
const std::vector<std::int64_t> shape =
session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
if (shape.size() != 4)
@@ -196,6 +254,17 @@ cv::Size get_input_image_shape(const Ort::Session &session)
}
+Ort::Value create_tensor(const Ort::TypeInfo& info, Ort::Allocator& alloc)
+{
+ const auto shape = info.GetTensorTypeAndShapeInfo().GetShape();
+ auto t = Ort::Value::CreateTensor<float>(
+ alloc, shape.data(), shape.size());
+ memset(t.GetTensorMutableData<float>(), 0, sizeof(float)*info.GetTensorTypeAndShapeInfo().GetElementCount());
+ return t;
+}
+
+
+
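create_tensor() above allocates a float tensor with the shape reported by the model and zero-fills it, which is how the recurrent hidden states get their initial value. A standalone sketch of the same pattern with an assumed 1x16 state shape (the shape and the helper name are illustrative, not from the commit):

#include <onnxruntime_cxx_api.h>
#include <algorithm>
#include <cstdint>

Ort::Value make_zero_state()
{
    Ort::AllocatorWithDefaultOptions alloc;       // default CPU allocator
    const std::int64_t shape[] = { 1, 16 };       // assumed hidden-state shape
    auto t = Ort::Value::CreateTensor<float>(alloc, shape, 2);
    std::fill_n(t.GetTensorMutableData<float>(), 16, 0.f);
    return t;
}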
int enum_to_fps(int value)
{
switch (value)
@@ -241,10 +310,7 @@ std::pair<float, cv::Rect2f> Localizer::run(
Timer t; t.start();
- const auto nt = omp_get_num_threads();
- omp_set_num_threads(num_threads);
session.Run(Ort::RunOptions{nullptr}, input_names, &input_val, 1, output_names, &output_val, 1);
- omp_set_num_threads(nt);
last_inference_time = t.elapsed_ms();
@@ -267,36 +333,85 @@ double Localizer::last_inference_time_millis() const
PoseEstimator::PoseEstimator(Ort::MemoryInfo &allocator_info, Ort::Session &&_session)
- : session{std::move(_session)}
- , model_version(session.GetModelMetadata().GetVersion())
+ : model_version{_session.GetModelMetadata().GetVersion()}
+ , session{std::move(_session)}
+ , allocator{session, allocator_info}
{
+ using namespace std::literals::string_literals;
+
+ if (session.GetOutputCount() < 2)
+ throw std::runtime_error("Invalid Model: must have at least two outputs");
+
+ // WARNING
+ // If the model was saved without metadata, the version field appears to be uninitialized.
+ // In that case reading from it is UB. In practice we just get some arbitrary number
+ // which is hopefully different from the numbers used by models where the version is set.
+ if (model_version != 2)
+ model_version = 1;
+
const cv::Size input_image_shape = get_input_image_shape(session);
scaled_frame = cv::Mat(input_image_shape, CV_8U);
- input_mat = cv::Mat(input_image_shape, CV_32F);
+ input_mat = cv::Mat(input_image_shape, CV_32F);
{
const std::int64_t input_shape[4] = { 1, 1, input_image_shape.height, input_image_shape.width };
- input_val = Ort::Value::CreateTensor<float>(allocator_info, input_mat.ptr<float>(0), input_mat.total(), input_shape, 4);
+ input_val.push_back(
+ Ort::Value::CreateTensor<float>(allocator_info, input_mat.ptr<float>(0), input_mat.total(), input_shape, 4));
}
{
const std::int64_t output_shape[2] = { 1, 3 };
- output_val[0] = Ort::Value::CreateTensor<float>(
- allocator_info, &output_coord[0], output_coord.rows, output_shape, 2);
+ output_val.push_back(Ort::Value::CreateTensor<float>(
+ allocator_info, &output_coord[0], output_coord.rows, output_shape, 2));
}
{
const std::int64_t output_shape[2] = { 1, 4 };
- output_val[1] = Ort::Value::CreateTensor<float>(
- allocator_info, &output_quat[0], output_quat.rows, output_shape, 2);
+ output_val.push_back(Ort::Value::CreateTensor<float>(
+ allocator_info, &output_quat[0], output_quat.rows, output_shape, 2));
}
+ size_t num_regular_outputs = 2;
+
+ if (session.GetOutputCount() >= 3 && "box"s == session.GetOutputName(2, allocator))
{
const std::int64_t output_shape[2] = { 1, 4 };
- output_val[2] = Ort::Value::CreateTensor<float>(
- allocator_info, &output_box[0], output_box.rows, output_shape, 2);
+ output_val.push_back(Ort::Value::CreateTensor<float>(
+ allocator_info, &output_box[0], output_box.rows, output_shape, 2));
+ ++num_regular_outputs;
+ qDebug() << "Note: Legacy model output for face ROI is currently ignored";
+ }
+
+ num_recurrent_states = session.GetInputCount()-1;
+ if (session.GetOutputCount()-num_regular_outputs != num_recurrent_states)
+ throw std::runtime_error("Invalid Model: after the regular inputs and outputs, the model must have an equal number of inputs and outputs for the tensors holding the hidden states of recurrent layers.");
+
+ // Create tensors for recurrent state
+ for (size_t i = 0; i < num_recurrent_states; ++i)
+ {
+ const auto& input_info = session.GetInputTypeInfo(1+i);
+ const auto& output_info = session.GetOutputTypeInfo(num_regular_outputs+i);
+ if (input_info.GetTensorTypeAndShapeInfo().GetShape() !=
+ output_info.GetTensorTypeAndShapeInfo().GetShape())
+ throw std::runtime_error("Invalid Model: Tensors for recurrent hidden states should have the same shape on input and output");
+ input_val.push_back(create_tensor(input_info, allocator));
+ output_val.push_back(create_tensor(output_info, allocator));
}
+
+ for (size_t i = 0; i < session.GetInputCount(); ++i)
+ {
+ input_names.push_back(session.GetInputName(i, allocator));
+ }
+ for (size_t i = 0; i < session.GetOutputCount(); ++i)
+ {
+ output_names.push_back(session.GetOutputName(i, allocator));
+ }
+
+ qDebug() << "Model inputs: " << session.GetInputCount() << ", outputs: " << session.GetOutputCount() << ", recurrent states: " << num_recurrent_states;
+
+ assert (input_names.size() == input_val.size());
+ assert (output_names.size() == output_val.size());
}
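For orientation, with a hypothetical single-state recurrent model whose graph declares inputs {x, h_in} and outputs {pos_size, quat, h_out} (names illustrative; the real ones are read from the graph at runtime), the constructor ends up with:

// input_names  = { "x", "h_in" }
// input_val    = { image tensor (1x1xHxW), zero-filled state tensor }
// output_names = { "pos_size", "quat", "h_out" }
// output_val   = { 1x3 position/size, 1x4 quaternion, state tensor shaped like h_in }
// num_recurrent_states = GetInputCount() - 1 = GetOutputCount() - num_regular_outputs = 1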
@@ -336,7 +451,7 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
};
cv::getRectSubPix(frame, {patch_size, patch_size}, patch_center, cropped);
- // Will get failure if patch_center is outside image boundaries.
+ // Will get failure if patch_center is outside image boundaries.
// Have to catch this case.
if (cropped.rows != patch_size || cropped.cols != patch_size)
return {};
@@ -355,26 +470,37 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
assert (input_mat.ptr(0) == p);
assert (!input_mat.empty() && input_mat.isContinuous());
- const char* input_names[] = {"x"};
- const char* output_names[] = {"pos_size", "quat", "box"};
Timer t; t.start();
- const auto nt = omp_get_num_threads();
- omp_set_num_threads(num_threads);
try
{
- session.Run(Ort::RunOptions{ nullptr }, input_names, &input_val, 1, output_names, output_val, 3);
+ session.Run(
+ Ort::RunOptions{ nullptr },
+ input_names.data(),
+ input_val.data(),
+ input_val.size(),
+ output_names.data(),
+ output_val.data(),
+ output_val.size());
}
catch (const Ort::Exception &e)
{
qDebug() << "Failed to run the model: " << e.what();
- omp_set_num_threads(nt);
return {};
}
- omp_set_num_threads(nt);
- // FIXME: Execution time fluctuates wildly. 19 to 26 ms. Why???
+ for (size_t i = 0; i<num_recurrent_states; ++i)
+ {
+ // Next step, the current output becomes the input.
+ // Thus we realize the recurrent connection.
+ // Only swaps the internal pointers. There is no copy of data.
+ std::swap(
+ output_val[output_val.size()-num_recurrent_states+i],
+ input_val[input_val.size()-num_recurrent_states+i]);
+ }
+
+ // FIXME: Execution time fluctuates wildly. 19 to 26 msec. Why?
// The instructions are always the same. Maybe a memory allocation
// issue. The ONNX API suggests that tensors are allocated in an
// arena. Does that matter? Maybe the issue is something else?
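The swap above realizes the recurrent connection across frames: the hidden state one call produces is handed back as an input to the next call, and since Ort::Value is a move-only handle, std::swap() only exchanges the underlying pointers. For the hypothetical single-state model sketched earlier, the flow over two frames is:

// frame N:   Run(inputs = {x, h})   -> outputs = {pos_size, quat, h'}
// swap:      h <-> h'               // pointer swap, no tensor data is copied
// frame N+1: Run(inputs = {x, h'})  // the state produced at frame N feeds frame N+1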
@@ -383,7 +509,7 @@ std::optional<PoseEstimator::Face> PoseEstimator::run(
// Perform coordinate transformation.
// From patch-local normalized in [-1,1] to
- // frame unnormalized pixel coordinates.
+ // frame unnormalized pixel coordinates.
const cv::Point2f center = patch_center +
(0.5f*patch_size)*cv::Point2f{output_coord[0], output_coord[1]};
@@ -462,7 +588,7 @@ bool neuralnet_tracker::detect()
auto face = poseestimator->run(grayscale, *last_roi);
last_inference_time += poseestimator->last_inference_time_millis();
-
+
if (!face)
{
last_roi.reset();
@@ -470,7 +596,23 @@ bool neuralnet_tracker::detect()
return false;
}
- last_roi = face->box;
+ {
+ // Here: compute the ROI from the head size estimate. This helps make the size prediction more
+ // invariant to mouth opening. The tracking can be lost more often at extreme
+ // rotations, depending on the implementation details. The code below has
+ // been tweaked so that it works pretty well.
+ // In the old behaviour, the ROI was taken directly from the model outputs.
+ const vec3 offset = rotate_vec(face->rotation, vec3{0.f, 0.1f*face->size, face->size*0.3f});
+ const float halfsize = face->size/float(settings.roi_zoom);
+ face->box = cv::Rect2f(
+ face->center.x + offset[0] - halfsize,
+ face->center.y + offset[1] - halfsize,
+ halfsize*2.f,
+ halfsize*2.f
+ );
+ }
+
+ last_roi = ewa_filter(*last_roi, face->box, float(settings.roi_filter_alpha));
Affine pose = compute_pose(*face);
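The ROI is now derived from the pose instead of being read from a model output: an offset of {0, 0.1*size, 0.3*size} is rotated into the head frame and added to the face centre, and roi_zoom scales the box. A worked example with assumed numbers (face size 100 px, identity rotation, roi_zoom = 1.0, cx/cy standing in for the face centre), reusing the helpers above:

const vec3 offset = rotate_vec(quat{1.f, 0.f, 0.f, 0.f}, vec3{0.f, 10.f, 30.f}); // identity rotation: stays {0, 10, 30}
const float halfsize = 100.f / 1.f;                 // face->size / settings.roi_zoom
const cv::Rect2f roi(cx + offset[0] - halfsize,     // the z component of the offset is not used for the 2D box
                     cy + offset[1] - halfsize,
                     2.f*halfsize, 2.f*halfsize);   // 200x200 px box, centred 10 px below the face centre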
@@ -487,6 +629,8 @@ bool neuralnet_tracker::detect()
Affine neuralnet_tracker::compute_pose(const PoseEstimator::Face &face) const
{
+ // Compute the location the network outputs in 3d space.
+
const mat33 rot_correction = compute_rotation_correction(
normalize(face.center, frame.rows, frame.cols),
intrinsics.focal_length_w);
@@ -507,7 +651,6 @@ Affine neuralnet_tracker::compute_pose(const PoseEstimator::Face &face) const
------------------------
*/
- // Compute the location the network outputs in 3d space.
const vec3 face_world_pos = image_to_world(
face.center.x, face.center.y, face.size, head_size_mm,
frame.size(),
@@ -519,9 +662,9 @@ Affine neuralnet_tracker::compute_pose(const PoseEstimator::Face &face) const
const vec3 pos = face_world_pos
+ m * vec3{
- static_cast<float>(s.offset_fwd),
- static_cast<float>(s.offset_up),
- static_cast<float>(s.offset_right)};
+ static_cast<float>(settings.offset_fwd),
+ static_cast<float>(settings.offset_up),
+ static_cast<float>(settings.offset_right)};
return { m, pos };
}
@@ -566,7 +709,7 @@ void neuralnet_tracker::draw_gizmos(
cv::circle(frame, cv::Point(xy[0],xy[1]), 5, cv::Scalar(0,0,255), -1);
}
- if (s.show_network_input)
+ if (settings.show_network_input)
{
cv::Mat netinput = poseestimator->last_network_input();
if (!netinput.empty())
@@ -587,7 +730,6 @@ void neuralnet_tracker::draw_gizmos(
neuralnet_tracker::neuralnet_tracker()
{
opencv_init();
- cv::setNumThreads(num_threads);
}
@@ -609,6 +751,8 @@ module_status neuralnet_tracker::start_tracker(QFrame* videoframe)
layout->addWidget(&*videoWidget);
videoframe->setLayout(&*layout);
videoWidget->show();
+ num_threads = settings.num_threads;
+ cv::setNumThreads(num_threads);
start();
return status_ok();
}
@@ -633,7 +777,7 @@ bool neuralnet_tracker::load_and_initialize_model()
// openmp settings. Which is what we do. omp_set_num_threads directly
// before running the inference pass.
opts.SetIntraOpNumThreads(num_threads);
- opts.SetInterOpNumThreads(num_threads);
+ opts.SetInterOpNumThreads(1);
allocator_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
localizer.emplace(
@@ -656,11 +800,11 @@ bool neuralnet_tracker::load_and_initialize_model()
bool neuralnet_tracker::open_camera()
{
- int fps = enum_to_fps(s.force_fps);
+ int fps = enum_to_fps(settings.force_fps);
QMutexLocker l(&camera_mtx);
- camera = video::make_camera(s.camera_name);
+ camera = video::make_camera(settings.camera_name);
if (!camera)
return false;
@@ -673,7 +817,7 @@ bool neuralnet_tracker::open_camera()
if (fps)
args.fps = fps;
- args.use_mjpeg = s.use_mjpeg;
+ args.use_mjpeg = settings.use_mjpeg;
if (!camera->start(args))
{
@@ -687,7 +831,7 @@ bool neuralnet_tracker::open_camera()
void neuralnet_tracker::set_intrinsics()
{
const int w = grayscale.cols, h = grayscale.rows;
- const double diag_fov = s.fov * M_PI / 180.;
+ const double diag_fov = settings.fov * M_PI / 180.;
const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w));
const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h));
const double focal_length_w = 1. / tan(.5 * fov_w);
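set_intrinsics() converts the user's diagonal FOV into horizontal and vertical FOVs; the formulas above are equivalent to tan^2(diag/2) = tan^2(fov_w/2) + tan^2(fov_h/2). A standalone numeric check with assumed values (60 degree diagonal FOV on a 640x480 frame), mirroring those formulas:

#include <cmath>
#include <cstdio>

int main()
{
    const double pi = std::acos(-1.);
    const double diag_fov = 60. * pi / 180.;
    const double w = 640., h = 480.;
    const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + (h/w)*(h/w)));  // about 49.6 deg
    const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + (w/h)*(w/h)));  // about 38.2 deg
    std::printf("fov_w = %.1f deg, fov_h = %.1f deg\n", fov_w*180./pi, fov_h*180./pi);
    return 0;
}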
@@ -746,8 +890,13 @@ void neuralnet_tracker::run()
set_intrinsics();
+ const auto nt = omp_get_num_threads();
+ omp_set_num_threads(num_threads);
+
detect();
+ omp_set_num_threads(nt);
+
if (frame.rows > 0)
videoWidget->update_image(frame);
@@ -823,26 +972,29 @@ neuralnet_dialog::neuralnet_dialog() :
ui.setupUi(this);
make_fps_combobox();
- tie_setting(s.force_fps, ui.cameraFPS);
+ tie_setting(settings.force_fps, ui.cameraFPS);
for (const auto& str : video::camera_names())
ui.cameraName->addItem(str);
- tie_setting(s.camera_name, ui.cameraName);
- tie_setting(s.fov, ui.cameraFOV);
- tie_setting(s.offset_fwd, ui.tx_spin);
- tie_setting(s.offset_up, ui.ty_spin);
- tie_setting(s.offset_right, ui.tz_spin);
- tie_setting(s.show_network_input, ui.showNetworkInput);
- tie_setting(s.use_mjpeg, ui.use_mjpeg);
+ tie_setting(settings.camera_name, ui.cameraName);
+ tie_setting(settings.fov, ui.cameraFOV);
+ tie_setting(settings.offset_fwd, ui.tx_spin);
+ tie_setting(settings.offset_up, ui.ty_spin);
+ tie_setting(settings.offset_right, ui.tz_spin);
+ tie_setting(settings.show_network_input, ui.showNetworkInput);
+ tie_setting(settings.roi_filter_alpha, ui.roiFilterAlpha);
+ tie_setting(settings.use_mjpeg, ui.use_mjpeg);
+ tie_setting(settings.roi_zoom, ui.roiZoom);
+ tie_setting(settings.num_threads, ui.threadCount);
connect(ui.buttonBox, SIGNAL(accepted()), this, SLOT(doOK()));
connect(ui.buttonBox, SIGNAL(rejected()), this, SLOT(doCancel()));
connect(ui.camera_settings, SIGNAL(clicked()), this, SLOT(camera_settings()));
- connect(&s.camera_name, value_::value_changed<QString>(), this, &neuralnet_dialog::update_camera_settings_state);
+ connect(&settings.camera_name, value_::value_changed<QString>(), this, &neuralnet_dialog::update_camera_settings_state);
- update_camera_settings_state(s.camera_name);
+ update_camera_settings_state(settings.camera_name);
connect(&calib_timer, &QTimer::timeout, this, &neuralnet_dialog::trans_calib_step);
calib_timer.setInterval(35);
@@ -852,7 +1004,7 @@ neuralnet_dialog::neuralnet_dialog() :
void neuralnet_dialog::doOK()
{
- s.b->save();
+ settings.b->save();
close();
}
@@ -871,7 +1023,7 @@ void neuralnet_dialog::camera_settings()
(void)tracker->camera->show_dialog();
}
else
- (void)video::show_dialog(s.camera_name);
+ (void)video::show_dialog(settings.camera_name);
}
@@ -942,9 +1094,9 @@ void neuralnet_dialog::startstop_trans_calib(bool start)
trans_calib.reset();
ui.sample_count_display->setText(QString());
// Tracker must run with zero'ed offset for calibration.
- s.offset_fwd = 0;
- s.offset_up = 0;
- s.offset_right = 0;
+ settings.offset_fwd = 0;
+ settings.offset_up = 0;
+ settings.offset_right = 0;
}
else
{
@@ -952,9 +1104,9 @@ void neuralnet_dialog::startstop_trans_calib(bool start)
qDebug() << "pt: stopping translation calibration";
{
auto [tmp, nsamples] = trans_calib.get_estimate();
- s.offset_fwd = int(tmp[0]);
- s.offset_up = int(tmp[1]);
- s.offset_right = int(tmp[2]);
+ settings.offset_fwd = int(tmp[0]);
+ settings.offset_up = int(tmp[1]);
+ settings.offset_right = int(tmp[2]);
}
}
ui.tx_spin->setEnabled(!start);
@@ -968,7 +1120,7 @@ void neuralnet_dialog::startstop_trans_calib(bool start)
}
-settings::settings() : opts("neuralnet-tracker") {}
+Settings::Settings() : opts("neuralnet-tracker") {}
} // neuralnet_tracker_ns