/* Copyright (c) 2021 Michael Welter
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 */

#include "ftnoir_tracker_neuralnet.h"
#include "deadzone_filter.h"
#include "opencv_contrib.h"

#include "compat/sleep.hpp"
#include "compat/math-imports.hpp"
#include "compat/timer.hpp"
#include "compat/check-visible.hpp"
#include "compat/camera-names.hpp"
#include "cv/init.hpp"

#include <opencv2/core.hpp>
#include <opencv2/core/quaternion.hpp>
#include <opencv2/imgproc.hpp>
#include <omp.h>

#ifdef _MSC_VER
#   pragma warning(disable : 4702)
#endif

#include <onnxruntime_cxx_api.h>

#include <QMutexLocker>
#include <QDebug>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <optional>
#include <string>
#include <tuple>
#include <utility>

// Some demo code for onnx
// https://github.com/microsoft/onnxruntime/blob/master/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp
// https://github.com/leimao/ONNX-Runtime-Inference/blob/main/src/inference.cpp

namespace neuralnet_tracker_ns
{

using namespace cvcontrib;

using numeric_types::vec3;
using numeric_types::vec2;
using numeric_types::mat33;

#if _MSC_VER
std::wstring convert(const QString &s) { return s.toStdWString(); }
#else
std::string convert(const QString &s) { return s.toStdString(); }
#endif


int enum_to_fps(int value)
{
    switch (value)
    {
        case fps_30:        return 30;
        case fps_60:        return 60;
        default: [[fallthrough]];
        case fps_default:   return 0;
    }
}


template<class F>
struct OnScopeExit
{
    explicit OnScopeExit(F&& f) : f_{ f } {}
    ~OnScopeExit() noexcept
    {
        f_();
    }
    F f_;
};


CamIntrinsics make_intrinsics(const cv::Mat& img, const Settings& settings)
{
    const int w = img.cols, h = img.rows;
    //const double diag_fov = settings.fov * M_PI / 180.;
    const double diag_fov = 60 * M_PI / 180.;
    (void)settings;
    // Split the diagonal FOV into horizontal and vertical FOV based on the aspect ratio.
    const double fov_w = 2.*atan(tan(diag_fov/2.)/sqrt(1. + h/(double)w * h/(double)w));
    const double fov_h = 2.*atan(tan(diag_fov/2.)/sqrt(1. + w/(double)h * w/(double)h));
    const double focal_length_w = 1. / tan(.5 * fov_w);
    const double focal_length_h = 1. / tan(.5 * fov_h);
    /*    a
        ______  <--- here is the sensor area
        |    /
        |   /
      f |  /
        | /   2 x angle is the fov
        |/
        <--- here is the hole of the pinhole camera

        So, a / f = tan(fov / 2)
        => f = a / tan(fov / 2)
        What is a?
        It is 1 if we define f in terms of clip space, where the image plane
        goes from -1 to 1, because a is the half-width.
    */
    return { (float)focal_length_w, (float)focal_length_h, (float)fov_w, (float)fov_h };
}
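// Worked example for the relations above (illustrative only, not used by the code):
// with a 4:3 frame and the fixed 60 degree diagonal FOV, tan(30 deg) ~= 0.577 and
// sqrt(1 + (3/4)^2) = 1.25, so fov_w = 2*atan(0.577/1.25) ~= 49.6 degrees and
// focal_length_w = 1/tan(fov_w/2) ~= 2.17 in clip-space units. The corresponding
// pixel focal length of a 640 px wide image would be focal_length_w * 640/2 ~= 693 px.
// The vertical numbers work out to fov_h ~= 38.2 degrees and focal_length_h ~= 2.89.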
cv::Rect make_crop_rect_multiple_of(const cv::Size &size, int multiple)
{
    const int new_w = (size.width / multiple) * multiple;
    const int new_h = (size.height / multiple) * multiple;
    return cv::Rect(
        (size.width-new_w)/2,
        (size.height-new_h)/2,
        new_w,
        new_h);
}


// Returns the smallest square that is centered on the input rectangle and contains it.
template<class T>
cv::Rect_<T> squarize(const cv::Rect_<T> &r)
{
    cv::Point_<T> c{r.x + r.width/T(2), r.y + r.height/T(2)};
    const T sz = std::max(r.height, r.width);
    return {c.x - sz/T(2), c.y - sz/T(2), sz, sz};
}


// Scales the rectangle by `factor` around its center.
template<class T>
cv::Rect_<T> expand(const cv::Rect_<T>& r, T factor)
{
    // xnew = l+.5*w - w*f*0.5 = l + .5*(w - new_w)
    const cv::Size_<T> new_size = { r.width * factor, r.height * factor };
    const cv::Point_<T> new_tl = r.tl() + (as_point(r.size()) - as_point(new_size)) / T(2);
    return cv::Rect_<T>(new_tl, new_size);
}


// Single step of an exponentially weighted moving average: blends the previous
// rectangle's center and size toward the current one by `alpha`.
template<class T>
cv::Rect_<T> ewa_filter(const cv::Rect_<T>& last, const cv::Rect_<T>& current, T alpha)
{
    const auto last_center = T(0.5) * (last.tl() + last.br());
    const auto cur_center = T(0.5) * (current.tl() + current.br());
    const cv::Point_<T> new_size = as_point(last.size()) + alpha * (as_point(current.size()) - as_point(last.size()));
    const cv::Point_<T> new_center = last_center + alpha * (cur_center - last_center);
    return cv::Rect_<T>(new_center - T(0.5) * new_size, as_size(new_size));
}


cv::Vec3f image_to_world(float x, float y, float size, float reference_size_in_mm,
                         const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
{
    /*
    Compute the location the network outputs in 3d space.

        hhhhhh  <- head size (meters)
        \     |
         \    |
          \   |        x (meters): distance from the pinhole
           \  |
      ____  \ |        face.size / width <- head size in image space
          \  \|
           \  |        focal length
            \ |
             \|
    ------------------------------------------------>> z direction

    Similar triangles:
        z / x = zi / f      (zi = image position, z = world position, f = focal length)

    We can also do deltas:
        dz / x = dzi / f
        => x = dz / dzi * f
    which means we can compute x from the head size in the image (dzi) if we
    assume some reference size (dz).
    */
    // The size from the model is more like half the real vertical size of a human head.
    const float head_size_vertical = 2.f*size;
    // Distance from the camera: x = dz / dzi * f, with the focal length converted
    // from clip-space units to pixels.
    const float xpos = -(intrinsics.focal_length_w * image_size.width * 0.5f) / head_size_vertical * reference_size_in_mm;
    const float zpos = (x / image_size.width * 2.f - 1.f) * xpos / intrinsics.focal_length_w;
    const float ypos = (y / image_size.height * 2.f - 1.f) * xpos / intrinsics.focal_length_h;
    return {xpos, ypos, zpos};
}
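// A minimal round-trip sketch of the two mappings above (kept out of the build;
// the intrinsics below are assumed values for a 640x480 frame at 60 degrees
// diagonal FOV, initialized in the order returned by make_intrinsics()).
#if 0
static void image_to_world_roundtrip_sketch()
{
    const CamIntrinsics c { 2.17f, 2.89f, 0.87f, 0.67f };
    const cv::Size2i size { 640, 480 };
    // A face at the image center: doubling its apparent size halves the distance
    // along the camera axis (the x component of the result).
    const cv::Vec3f p  = image_to_world(320.f, 240.f,  60.f, HEAD_SIZE_MM, size, c);
    const cv::Vec3f p2 = image_to_world(320.f, 240.f, 120.f, HEAD_SIZE_MM, size, c);
    // p2[0] == 0.5f * p[0]
    // Projecting the world position back recovers the original pixel coordinates.
    const vec2 uv = world_to_image(p, size, c);   // ~= (320, 240)
    (void)p2; (void)uv;
}
#endif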
vec2 world_to_image(const cv::Vec3f& pos, const cv::Size2i& image_size, const CamIntrinsics& intrinsics)
{
    const float xscr = pos[2] / pos[0] * intrinsics.focal_length_w;
    const float yscr = pos[1] / pos[0] * intrinsics.focal_length_h;
    const float x = (xscr+1.)*0.5f*image_size.width;
    const float y = (yscr+1.)*0.5f*image_size.height;
    return {x, y};
}


cv::Quatf image_to_world(cv::Quatf q)
{
    std::swap(q[1], q[3]);
    q[1] = -q[1];
    q[2] = -q[2];
    q[3] = -q[3];
    return q;
}


cv::Point2f normalize(const cv::Point2f &p, int h, int w)
{
    return {
        p.x/w*2.f-1.f,
        p.y/h*2.f-1.f
    };
}


cv::Quatf rotation_from_two_vectors(const vec3 &a, const vec3 &b)
{
    // |axis| = |a| * |b| * sin(alpha)
    const vec3 axis = a.cross(b);
    // dot = |a|*|b|*cos(alpha)
    const float dot = a.dot(b);
    const float len = cv::norm(axis);
    vec3 normed_axis = axis / len;
    float angle = std::atan2(len, dot);
    if (!(std::isfinite(normed_axis[0]) && std::isfinite(normed_axis[1]) && std::isfinite(normed_axis[2])))
    {
        angle = 0.f;
        normed_axis = vec3{1.,0.,0.};
    }
    return cv::Quatf::createFromAngleAxis(angle, normed_axis);
}


// Computes the correction due to the head being off the screen center.
cv::Quatf compute_rotation_correction(const cv::Point3f& p)
{
    return rotation_from_two_vectors({-1.f,0.f,0.f}, p);
}


// Intersection over union. A value between 0 and 1 which measures the match between the bounding boxes.
template<class T>
T iou(const cv::Rect_<T> &a, const cv::Rect_<T> &b)
{
    auto i = a & b;
    return double{i.area()} / (a.area()+b.area()-i.area());
}
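// Example of the IoU gate used by detect() further below (made-up boxes, kept out
// of the build): two 100x100 rectangles offset by 50 px overlap in a 50x100 region,
// so iou = 5000 / (10000 + 10000 - 5000) = 1/3, which is above the 0.25 threshold
// that decides whether the localizer needs to run again.
#if 0
static void iou_gate_sketch()
{
    const cv::Rect_<float> a {  0.f, 0.f, 100.f, 100.f };
    const cv::Rect_<float> b { 50.f, 0.f, 100.f, 100.f };
    const float overlap = iou(a, b);   // ~0.33
    (void)overlap;
}
#endif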
class GuardedThreadCountSwitch
{
    int old_num_threads_cv_ = 1;
    int old_num_threads_omp_ = 1;
public:
    GuardedThreadCountSwitch(int num_threads)
    {
        old_num_threads_cv_ = cv::getNumThreads();
        old_num_threads_omp_ = omp_get_num_threads();
        omp_set_num_threads(num_threads);
        cv::setNumThreads(num_threads);
    }

    ~GuardedThreadCountSwitch()
    {
        omp_set_num_threads(old_num_threads_omp_);
        cv::setNumThreads(old_num_threads_cv_);
    }

    GuardedThreadCountSwitch(const GuardedThreadCountSwitch&) = delete;
    GuardedThreadCountSwitch& operator=(const GuardedThreadCountSwitch&) = delete;
};


bool NeuralNetTracker::detect()
{
    double inference_time = 0.;

    OnScopeExit update_inference_time{ [&]() {
        QMutexLocker lck{ &stats_mtx_ };
        inference_time_ = inference_time;
    } };

    // If there is no past ROI from the localizer, or if the match of its output
    // with the current ROI is too poor, we have to run it again. This causes a
    // latency spike of maybe an additional 50%. But it only occurs when the user
    // moves his head far enough - or when the tracking is lost ...
    if (!last_localizer_roi_ || !last_roi_ ||
        iou(*last_localizer_roi_,*last_roi_)<0.25)
    {
        auto [p, rect] = localizer_->run(grayscale_);
        inference_time += localizer_->last_inference_time_millis();

        if (last_roi_ && iou(rect,*last_roi_)>=0.25 && p > 0.5)
        {
            // The new ROI matches the result from tracking, so the user is still
            // there. To not disturb recurrent models, we only update the localizer
            // ROI and keep the tracking ROI as it is.
            last_localizer_roi_ = rect;
        }
        else if (p > 0.5 && rect.height > 32 && rect.width > 32)
        {
            // Tracking probably got lost since the ROIs don't match, but the
            // localizer still finds a face, so we use the ROI from the localizer.
            last_localizer_roi_ = rect;
            last_roi_ = rect;
        }
        else
        {
            // Tracking lost and no localization result. The user probably can't be seen.
            last_roi_.reset();
            last_localizer_roi_.reset();
        }
    }

    if (!last_roi_)
    {
        // Last iteration the tracker failed to generate a trustworthy
        // ROI and the localizer also cannot find a face.
        draw_gizmos({}, {});
        return false;
    }

    auto face = poseestimator_->run(grayscale_, *last_roi_);
    inference_time += poseestimator_->last_inference_time_millis();

    if (!face)
    {
        last_roi_.reset();
        draw_gizmos({}, {});
        return false;
    }

    // Zoom the detected face box a little and smooth it over time to obtain the
    // ROI for the next frame.
    cv::Rect2f roi = expand(face->box, (float)settings_.roi_zoom);
    last_roi_ = ewa_filter(*last_roi_, roi, float(settings_.roi_filter_alpha));

    QuatPose pose = compute_filtered_pose(*face);
    last_pose_ = pose;

    Affine pose_affine = {
        pose.rot.toRotMat3x3(cv::QUAT_ASSUME_UNIT),
        pose.pos };

    {
        QMutexLocker lck(&mtx_);
        last_pose_affine_ = pose_affine;
    }

    draw_gizmos(*face, pose_affine);

    return true;
}
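// A small sketch of the ROI smoothing applied above (illustrative values, kept out
// of the build): with alpha = 1 the filtered ROI jumps straight to the newly
// detected rectangle, while a small alpha such as 0.1 moves the previous ROI only
// 10% of the way toward it, trading responsiveness for a steadier crop for the
// pose estimator.
#if 0
static void roi_smoothing_sketch()
{
    const cv::Rect2f previous {100.f, 100.f, 200.f, 200.f};
    const cv::Rect2f detected {140.f, 100.f, 200.f, 200.f};
    const cv::Rect2f smoothed = ewa_filter(previous, detected, 0.1f);  // x ~= 104
    const cv::Rect2f jump     = ewa_filter(previous, detected, 1.0f);  // == detected
    (void)smoothed; (void)jump;
}
#endif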
void NeuralNetTracker::draw_gizmos(
    const std::optional<PoseEstimator::Face> &face,
    const Affine& pose)
{
    if (!is_visible_)
        return;

    preview_.draw_gizmos(
        face,
        last_roi_,
        last_localizer_roi_,
        world_to_image(pose.t, grayscale_.size(), intrinsics_));

    if (settings_.show_network_input)
    {
        cv::Mat netinput = poseestimator_->last_network_input();
        preview_.overlay_netinput(netinput);
    }
}


QuatPose NeuralNetTracker::transform_to_world_pose(const cv::Quatf &face_rotation,
                                                   const cv::Point2f& face_xy,
                                                   const float face_size) const
{
    const vec3 face_world_pos = image_to_world(
        face_xy.x, face_xy.y, face_size, HEAD_SIZE_MM,
        grayscale_.size(),
        intrinsics_);

    const cv::Quatf rot_correction = compute_rotation_correction(face_world_pos);

    cv::Quatf rot = rot_correction * image_to_world(face_rotation);

    // But this position is in general not the location of the rotation joint
    // in the neck. So we need an extra offset, which we determine by computing
    //     x,y,z-pos = head_joint_loc + R_face * offset
    const vec3 local_offset = vec3{
        static_cast<float>(settings_.offset_fwd),
        static_cast<float>(settings_.offset_up),
        static_cast<float>(settings_.offset_right)};
    const vec3 offset = rotate(rot, local_offset);
    const vec3 pos = face_world_pos + offset;

    return { rot, pos };
}
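// A sketch of the offset model above (made-up numbers, kept out of the build):
// the configured local offset (fwd, up, right) is rotated by the head rotation
// first, so the reported position pivots around the assumed joint rather than
// the face center. With an identity rotation the offset is simply added.
#if 0
static void neck_offset_sketch()
{
    const cv::Quatf rot { 1.f, 0.f, 0.f, 0.f };            // identity rotation
    const vec3 face_world_pos { -600.f, 0.f, 0.f };        // ~60 cm in front of the camera
    const vec3 local_offset { 100.f, 0.f, 0.f };           // 100 mm "forward"
    const vec3 pos = face_world_pos + rotate(rot, local_offset);
    // pos == {-500, 0, 0}; in this file's convention (negative x in front of the
    // camera) that is 10 cm closer to it.
    (void)pos;
}
#endif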
QuatPose NeuralNetTracker::compute_filtered_pose(const PoseEstimator::Face &face)
{
    // With a known frame rate, a previous pose and per-output uncertainty from the
    // model, run the deadzone filter; otherwise return the raw pose.
    if (fps_ > 0.01 && last_pose_ && poseestimator_->has_uncertainty())
    {
        auto image2world = [this](const cv::Quatf &face_rotation, const cv::Point2f& face_xy, const float face_size) {
            return this->transform_to_world_pose(face_rotation, face_xy, face_size); };

        return apply_filter(
            face,
            *last_pose_,
            1./fps_,
            std::move(image2world),
            FiltParams{
                float(settings_.deadzone_hardness),
                float(settings_.deadzone_size)
            });
    }
    else
    {
        return transform_to_world_pose(face.rotation, face.center, face.size);
    }
}


NeuralNetTracker::NeuralNetTracker()
{
    opencv_init();
    neuralnet_tracker_tests::run();
}


NeuralNetTracker::~NeuralNetTracker()
{
    requestInterruption();
    wait();
    // fast start/stop causes breakage
    portable::sleep(1000);
}


module_status NeuralNetTracker::start_tracker(QFrame* videoframe)
{
    videoframe->show();
    // The concrete widget and layout types are declared in the header; deduce
    // them from the members so the definitions stay in sync.
    video_widget_ = std::make_unique<decltype(video_widget_)::element_type>(videoframe);
    layout_ = std::make_unique<decltype(layout_)::element_type>();
    layout_->setContentsMargins(0, 0, 0, 0);
    layout_->addWidget(&*video_widget_);
    videoframe->setLayout(&*layout_);
    video_widget_->show();
    num_threads_ = settings_.num_threads;
    start();
    return status_ok();
}


bool NeuralNetTracker::load_and_initialize_model()
{
    const QString localizer_model_path_enc =
        OPENTRACK_BASE_PATH+"/" OPENTRACK_LIBRARY_PATH "/models/head-localizer.onnx";
    const QString poseestimator_model_path_enc =
        OPENTRACK_BASE_PATH+"/" OPENTRACK_LIBRARY_PATH "/models/head-pose.onnx";

    try
    {
        env_ = Ort::Env{
            OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR,
            "tracker-neuralnet"
        };
        auto opts = Ort::SessionOptions{};
        // Do the thread settings here do anything?
        // There is a warning which says to control the number of threads via
        // OpenMP settings. Which is what we do.
        opts.SetIntraOpNumThreads(num_threads_);
        opts.SetInterOpNumThreads(1);
        allocator_info_ = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);

        localizer_.emplace(
            allocator_info_,
            Ort::Session{env_, convert(localizer_model_path_enc).c_str(), opts});

        poseestimator_.emplace(
            allocator_info_,
            Ort::Session{env_, convert(poseestimator_model_path_enc).c_str(), opts});
    }
    catch (const Ort::Exception &e)
    {
        qDebug() << "Failed to initialize the neural network models. ONNX error message: "
                 << e.what();
        return false;
    }

    return true;
}


bool NeuralNetTracker::open_camera()
{
#if 0
    int rint = std::clamp(*settings_.resolution, 0, (int)std::size(resolution_choices)-1);
    resolution_tuple res = resolution_choices[rint];
    int fps = enum_to_fps(settings_.force_fps);
#endif

    video::impl::camera::info args {};
    args.width = 640;
    args.height = 480;
    args.fps = 60;
    args.use_mjpeg = true;

    QMutexLocker l(&camera_mtx_);

    camera_ = nullptr;

    const QString name = settings_.camera_name;
    if (name.isEmpty() || name == "TrackHat sensor")
    {
        camera_ = video::make_camera_("TrackHat sensor");
        if (camera_ && camera_->start(args))
            return true;
        if (!name.isEmpty())
            return false;
    }

    camera_ = video::make_camera(name);
    if (!camera_)
        return false;

#if 0
    video::impl::camera::info args {};
    if (res.width)
    {
        args.width = res.width;
        args.height = res.height;
    }
    if (fps)
        args.fps = fps;
    args.use_mjpeg = settings_.use_mjpeg;
#endif

    if (!camera_->start(args))
    {
        qDebug() << "neuralnet tracker: can't open camera";
        return false;
    }

    return true;
}


void NeuralNetTracker::run()
{
    preview_.init(*video_widget_);

    GuardedThreadCountSwitch switch_num_threads_to(num_threads_);

    if (!open_camera())
        return;

    if (!load_and_initialize_model())
        return;

    std::chrono::high_resolution_clock clk;

    while (!isInterruptionRequested())
    {
        is_visible_ = check_is_visible();
        auto t = clk.now();
        {
            QMutexLocker l(&camera_mtx_);

            auto [ img, res ] = camera_->get_frame();

            if (!res)
            {
                l.unlock();
                portable::sleep(100);
                continue;
            }

            {
                QMutexLocker lck{&stats_mtx_};
                resolution_ = { img.width, img.height };
            }

            auto color = prepare_input_image(img);

            if (is_visible_)
                preview_.copy_video_frame(color);

            switch (img.channels)
            {
            case 1:
                grayscale_.create(img.height, img.width, CV_8UC1);
                color.copyTo(grayscale_);
                break;
            case 3:
                cv::cvtColor(color, grayscale_, cv::COLOR_BGR2GRAY);
                break;
            default:
                qDebug() << "Can't handle" << img.channels << "color channels";
                return;
            }
        }

        intrinsics_ = make_intrinsics(grayscale_, settings_);

        detect();

        if (is_visible_)
            preview_.copy_to_widget(*video_widget_);

        update_fps(
            std::chrono::duration_cast<std::chrono::milliseconds>(
                clk.now() - t).count()*1.e-3);
    }

    camera_ = nullptr;
}


cv::Mat NeuralNetTracker::prepare_input_image(const video::frame& frame)
{
    auto img = cv::Mat(frame.height, frame.width, CV_8UC(frame.channels), (void*)frame.data, frame.stride);

    // Crop if the aspect ratio is not 4:3.
    if (img.rows*4 != img.cols*3)
    {
        img = img(make_crop_rect_for_aspect(img.size(), 4, 3));
    }

    img = img(make_crop_rect_multiple_of(img.size(), 4));

    // Downsample by factors of two (at most twice) until the width is at most 640 px.
    if (img.cols > 640)
    {
        cv::pyrDown(img, downsized_original_images_[0]);
        img = downsized_original_images_[0];
    }
    if (img.cols > 640)
    {
        cv::pyrDown(img, downsized_original_images_[1]);
        img = downsized_original_images_[1];
    }

    return img;
}


void NeuralNetTracker::update_fps(double dt)
{
    const double alpha = dt/(dt + RC);

    if (dt > 1e-6)
    {
        QMutexLocker lck{&stats_mtx_};
        fps_ *= 1 - alpha;
        fps_ += alpha * 1./dt;
    }
}
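// Worked example for the rate estimate above (illustrative numbers): update_fps()
// keeps an exponentially weighted average of the instantaneous rate 1/dt. Assuming
// RC = 0.25 s and frames arriving every dt = 1/60 s, alpha = dt/(dt + RC) = 1/16,
// so each frame moves the reported FPS about 6% of the way toward the current
// 60 Hz value; a smaller RC reacts faster, a larger one gives a steadier readout.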
void NeuralNetTracker::data(double *data)
{
    auto tmp2 = [&]()
    {
        QMutexLocker lck(&mtx_);
        return last_pose_affine_;
    }();

    if (!tmp2)
        return;

    const auto& tmp = *tmp2;

    // Extract Euler angles from the columns of the rotation matrix.
    const auto& mx = tmp.R.col(0);
    const auto& my = tmp.R.col(1);
    const auto& mz = -tmp.R.col(2);

    const float yaw = std::atan2(mx(2), mx(0));
    const float pitch = -std::atan2(-mx(1), std::sqrt(mx(2)*mx(2)+mx(0)*mx(0)));
    const float roll = -std::atan2(-my(2), mz(2));
    {
        constexpr double rad2deg = 180/M_PI;
        data[Yaw]   = rad2deg * yaw;
        data[Pitch] = rad2deg * pitch;
        data[Roll]  = rad2deg * roll;

        // Convert to cm and map the tracker's axes onto opentrack's TX/TY/TZ.
        data[TX] = -tmp.t[2] * 0.1;
        data[TY] =  tmp.t[1] * 0.1;
        data[TZ] = -tmp.t[0] * 0.1;
    }
}


Affine NeuralNetTracker::pose()
{
    QMutexLocker lck(&mtx_);
    return last_pose_affine_ ? *last_pose_affine_ : Affine{};
}


std::tuple<cv::Size, double, double> NeuralNetTracker::stats() const
{
    QMutexLocker lck(&stats_mtx_);
    return { resolution_, fps_, inference_time_ };
}


void NeuralNetDialog::make_fps_combobox()
{
#if 0
    for (int k = 0; k < fps_MAX; k++)
    {
        const int hz = enum_to_fps(k);
        const QString name = (hz == 0) ? tr("Default") : QString::number(hz);
        ui_.cameraFPS->addItem(name, k);
    }
#endif
}


void NeuralNetDialog::make_resolution_combobox()
{
#if 0
    int k=0;
    for (const auto [w, h] : resolution_choices)
    {
        const QString s = (w == 0)
            ? tr("Default")
            : QString::number(w) + " x " + QString::number(h);
        ui_.resolution->addItem(s, k++);
    }
#endif
}


NeuralNetDialog::NeuralNetDialog() :
    trans_calib_(1, 2)
{
    ui_.setupUi(this);

    make_fps_combobox();
    make_resolution_combobox();

    ui_.cameraName->addItem(QString{});
    for (const auto& str : video::camera_names())
        ui_.cameraName->addItem(str);

    tie_setting(settings_.camera_name, ui_.cameraName);
#if 0
    tie_setting(settings_.fov, ui_.cameraFOV);
#endif
    tie_setting(settings_.offset_fwd, ui_.tx_spin);
    tie_setting(settings_.offset_up, ui_.ty_spin);
    tie_setting(settings_.offset_right, ui_.tz_spin);
#if 0
    tie_setting(settings_.show_network_input, ui_.showNetworkInput);
    tie_setting(settings_.roi_filter_alpha, ui_.roiFilterAlpha);
    tie_setting(settings_.use_mjpeg, ui_.use_mjpeg);
    tie_setting(settings_.roi_zoom, ui_.roiZoom);
    tie_setting(settings_.num_threads, ui_.threadCount);
    tie_setting(settings_.resolution, ui_.resolution);
    tie_setting(settings_.force_fps, ui_.cameraFPS);
#endif

    {
        const struct {
            QString label;
            exposure_preset preset;
        } presets[] = {
            { QStringLiteral("Near (1-3ft)"), exposure_preset::near },
            { QStringLiteral("Medium (3-6ft)"), exposure_preset::medium },
            { QStringLiteral("Far (6+ft)"), exposure_preset::far },
            { QStringLiteral("Custom"), exposure_preset::ignored },
        };
        for (const auto& [label, preset] : presets)
            ui_.exposure_preset->addItem(label, int(preset));

        tie_setting(cs_.exposure, ui_.exposure_preset);
    }

    connect(ui_.buttonBox, SIGNAL(accepted()), this, SLOT(doOK()));
    connect(ui_.buttonBox, SIGNAL(rejected()), this, SLOT(doCancel()));
    connect(ui_.camera_settings, SIGNAL(clicked()), this, SLOT(camera_settings()));

    connect(&settings_.camera_name, value_::value_changed<QString>(),
            this, &NeuralNetDialog::update_camera_settings_state);
    update_camera_settings_state(settings_.camera_name);

    connect(&calib_timer_, &QTimer::timeout, this, &NeuralNetDialog::trans_calib_step);
    calib_timer_.setInterval(35);
    connect(ui_.tcalib_button, SIGNAL(toggled(bool)), this, SLOT(startstop_trans_calib(bool)));

    connect(&tracker_status_poll_timer_, &QTimer::timeout, this, &NeuralNetDialog::status_poll);
    tracker_status_poll_timer_.setInterval(250);
    tracker_status_poll_timer_.start();
}


void NeuralNetDialog::save() { settings_.b->save(); cs_.b->save(); }
void NeuralNetDialog::reload() { settings_.b->reload(); cs_.b->reload(); }
void NeuralNetDialog::doOK() { save(); close(); }
void NeuralNetDialog::doCancel() { close(); }


void NeuralNetDialog::camera_settings()
{
    if (tracker_)
    {
        QMutexLocker l(&tracker_->camera_mtx_);
        (void)tracker_->camera_->show_dialog();
    }
    else
        (void)video::show_dialog(settings_.camera_name);
}


void NeuralNetDialog::update_camera_settings_state(const QString& name)
{
    (void)name;
    ui_.camera_settings->setEnabled(true);
}


void NeuralNetDialog::register_tracker(ITracker * x)
{
    tracker_ = static_cast<NeuralNetTracker*>(x);
    ui_.tcalib_button->setEnabled(true);
}
void NeuralNetDialog::unregister_tracker()
{
    tracker_ = nullptr;
    ui_.tcalib_button->setEnabled(false);
}


bool NeuralNetDialog::embeddable() noexcept { return true; }

void NeuralNetDialog::set_buttons_visible(bool x) { ui_.buttonBox->setVisible(x); }


void NeuralNetDialog::status_poll()
{
    QString status;
    if (!tracker_)
    {
        status = tr("Tracker Offline");
    }
    else
    {
        auto [ res, fps, inference_time ] = tracker_->stats();
        status = tr("%1x%2 @ %3 FPS / Inference: %4 ms")
            .arg(res.width).arg(res.height).arg(int(fps)).arg(int(inference_time));
    }
    ui_.resolution_display->setText(status);
}


void NeuralNetDialog::trans_calib_step()
{
    if (tracker_)
    {
        const Affine X_CM = [&]() {
            QMutexLocker l(&calibrator_mutex_);
            return tracker_->pose();
        }();
        trans_calib_.update(X_CM.R, X_CM.t);
        auto [_, nsamples] = trans_calib_.get_estimate();

        constexpr int min_yaw_samples = 15;
        constexpr int min_pitch_samples = 12;
        constexpr int min_samples = min_yaw_samples+min_pitch_samples;

        // Don't bother counting roll samples. Roll calibration is hard enough
        // that it's a hidden unsupported feature anyway.

        QString sample_feedback;
        if (nsamples[0] < min_yaw_samples)
            sample_feedback = tr("%1 yaw samples. Yaw more to %2 samples for stable calibration.").arg(nsamples[0]).arg(min_yaw_samples);
        else if (nsamples[1] < min_pitch_samples)
            sample_feedback = tr("%1 pitch samples. Pitch more to %2 samples for stable calibration.").arg(nsamples[1]).arg(min_pitch_samples);
        else
        {
            const int nsamples_total = nsamples[0] + nsamples[1];
            sample_feedback = tr("%1 samples. Over %2, good!").arg(nsamples_total).arg(min_samples);
        }
        ui_.sample_count_display->setText(sample_feedback);
    }
    else
        startstop_trans_calib(false);
}


void NeuralNetDialog::startstop_trans_calib(bool start)
{
    QMutexLocker l(&calibrator_mutex_);
    // FIXME: does not work ...
    if (start)
    {
        qDebug() << "pt: starting translation calibration";
        calib_timer_.start();
        trans_calib_.reset();
        ui_.sample_count_display->setText(QString());
        // Tracker must run with zero'ed offset for calibration.
        settings_.offset_fwd = 0;
        settings_.offset_up = 0;
        settings_.offset_right = 0;
    }
    else
    {
        calib_timer_.stop();
        qDebug() << "pt: stopping translation calibration";
        {
            auto [tmp, nsamples] = trans_calib_.get_estimate();
            settings_.offset_fwd = int(tmp[0]);
            settings_.offset_up = int(tmp[1]);
            settings_.offset_right = int(tmp[2]);
        }
    }
    ui_.tx_spin->setEnabled(!start);
    ui_.ty_spin->setEnabled(!start);
    ui_.tz_spin->setEnabled(!start);

    if (start)
        ui_.tcalib_button->setText(tr("Stop calibration"));
    else
        ui_.tcalib_button->setText(tr("Start calibration"));
}


Settings::Settings() : opts("neuralnet-tracker") {}

} // neuralnet_tracker_ns

OPENTRACK_DECLARE_TRACKER(NeuralNetTracker, NeuralNetDialog, NeuralNetMetadata)