Diffstat (limited to 'eigen/unsupported')
96 files changed, 1376 insertions, 7343 deletions
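Note on the Eigen/CXX11/ThreadPool hunk further down: it removes the block that defined the Eigen::ThreadPool and ThreadPoolTempl aliases (selecting NonBlockingThreadPool vs. SimpleThreadPool), while the includes for NonBlockingThreadPool.h and SimpleThreadPool.h remain. A minimal sketch, assuming the usual NonBlockingThreadPool and ThreadPoolDevice constructors from this era of the Tensor module, of how downstream code might name the concrete pool type directly instead of the removed alias; the sizes and thread count here are illustrative only:

  #define EIGEN_USE_THREADS
  #include <unsupported/Eigen/CXX11/Tensor>

  int main() {
    // The Eigen::ThreadPool alias is gone in this tree, so use the concrete
    // non-blocking pool type, whose header is still included by ThreadPool.
    Eigen::NonBlockingThreadPool pool(4);          // 4 worker threads (illustrative)
    Eigen::ThreadPoolDevice device(&pool, 4);      // device wrapper over the pool

    Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
    a.setRandom();
    b.setRandom();

    // Evaluate the expression on the thread pool device.
    c.device(device) = a + b;
    return 0;
  }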
diff --git a/eigen/unsupported/CMakeLists.txt b/eigen/unsupported/CMakeLists.txt index 9a56661..4fef40a 100644 --- a/eigen/unsupported/CMakeLists.txt +++ b/eigen/unsupported/CMakeLists.txt @@ -1,9 +1,7 @@ add_subdirectory(Eigen) add_subdirectory(doc EXCLUDE_FROM_ALL) -if(BUILD_TESTING) - if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) - add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest - else() - add_subdirectory(test EXCLUDE_FROM_ALL) - endif() +if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) + add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest +else() + add_subdirectory(test EXCLUDE_FROM_ALL) endif() diff --git a/eigen/unsupported/Eigen/CXX11/Tensor b/eigen/unsupported/Eigen/CXX11/Tensor index 3991609..7ecb4c7 100644 --- a/eigen/unsupported/Eigen/CXX11/Tensor +++ b/eigen/unsupported/Eigen/CXX11/Tensor @@ -13,14 +13,13 @@ #include "../../../Eigen/Core" -#if defined(EIGEN_USE_SYCL) +#ifdef EIGEN_USE_SYCL #undef min #undef max #undef isnan #undef isinf #undef isfinite #include <SYCL/sycl.hpp> -#include <iostream> #include <map> #include <memory> #include <utility> @@ -53,10 +52,8 @@ typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; -#include <windows.h> #else #include <stdint.h> -#include <unistd.h> #endif #if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 @@ -71,10 +68,6 @@ typedef unsigned __int64 uint64_t; #include <time.h> #endif -#if defined(EIGEN_USE_LIBXSMM) -#include "libxsmm.h" -#endif - #ifdef EIGEN_USE_THREADS #include "ThreadPool" #endif diff --git a/eigen/unsupported/Eigen/CXX11/ThreadPool b/eigen/unsupported/Eigen/CXX11/ThreadPool index c346141..09d637e 100644 --- a/eigen/unsupported/Eigen/CXX11/ThreadPool +++ b/eigen/unsupported/Eigen/CXX11/ThreadPool @@ -50,7 +50,6 @@ #include "src/ThreadPool/ThreadLocal.h" #include "src/ThreadPool/ThreadYield.h" -#include "src/ThreadPool/ThreadCancel.h" #include "src/ThreadPool/EventCount.h" #include "src/ThreadPool/RunQueue.h" #include "src/ThreadPool/ThreadPoolInterface.h" @@ -58,18 +57,6 @@ #include "src/ThreadPool/SimpleThreadPool.h" #include "src/ThreadPool/NonBlockingThreadPool.h" - -// Use the more efficient NonBlockingThreadPool by default. -namespace Eigen { -#ifndef EIGEN_USE_SIMPLE_THREAD_POOL -template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>; -typedef NonBlockingThreadPool ThreadPool; -#else -template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>; -typedef SimpleThreadPool ThreadPool; -#endif -} // namespace Eigen - #endif #include <Eigen/src/Core/util/ReenableStupidWarnings.h> diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md b/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md index 38cdb9c..98e8381 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1737,9 +1737,11 @@ TODO ## Representation of scalar values -Scalar values are often represented by tensors of size 1 and rank 0.For example -Tensor<T, N>::maximum() currently returns a Tensor<T, 0>. Similarly, the inner -product of 2 1d tensors (through contractions) returns a 0d tensor. +Scalar values are often represented by tensors of size 1 and rank 1. It would be +more logical and user friendly to use tensors of rank 0 instead. For example +Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner +product of 2 1d tensors (through contractions) returns a 1d tensor. 
In the +future these operations might be updated to return 0d tensors instead. ## Limitations diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index fbe3408..7a45a5c 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -186,12 +186,6 @@ class TensorBase<Derived, ReadOnlyAccessors> } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived> - expm1() const { - return unaryExpr(internal::scalar_expm1_op<Scalar>()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> log() const { return unaryExpr(internal::scalar_log_op<Scalar>()); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 23a7446..4cfe300 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -54,7 +54,7 @@ struct is_input_scalar<Sizes<> > { static const bool value = true; }; #ifndef EIGEN_EMULATE_CXX11_META_H -template <typename std::ptrdiff_t... Indices> +template <typename std::size_t... Indices> struct is_input_scalar<Sizes<Indices...> > { static const bool value = (Sizes<Indices...>::total_size == 1); }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index c46a778..1ba7ef1 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -50,7 +50,6 @@ template <DenseIndex DimId> struct DimensionId { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { - EIGEN_UNUSED_VARIABLE(dim); eigen_assert(dim == DimId); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { @@ -151,7 +150,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(NumInputDims > m_dim.actualDim()); @@ -207,7 +206,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || - (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. 
eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; @@ -219,7 +218,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> PacketReturnType rslt = internal::pload<PacketReturnType>(values); return rslt; } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || - (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); return m_impl.template packet<LoadMode>(index + m_inputOffset); @@ -275,29 +274,17 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> } } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const { - return m_dim.actualDim(); - } - - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const { - return m_offset; - } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || - (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { + (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; @@ -317,9 +304,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> TensorEvaluator<ArgType, Device> m_impl; const internal::DimensionId<DimId> m_dim; const Device& m_device; -// required by sycl - const DenseIndex m_offset; - }; @@ -360,7 +344,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) || - (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. 
eigen_assert(this->m_stride == 1); EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; @@ -371,7 +355,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> inputIndex += this->m_inputStride; } } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || - (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) { + (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(this->m_stride > index); this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 2c7ba96..59bf90d 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -276,12 +276,6 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy } EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - const Axis& axis() const { return m_axis; } protected: Dimensions m_dimensions; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index bf4a476..20b29e5 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -20,70 +20,6 @@ namespace Eigen { * */ namespace internal { -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) -template<typename Scalar, typename Index> -void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) { - size_t psize = packet_traits<Scalar>::size; // Packet size - typedef typename packet_traits<Scalar>::type Packet; // Packet type - size_t alignment = psize*sizeof(Scalar); // Needed alignment - if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 && - (ldsrc*sizeof(Scalar)) % alignment == 0 && - reinterpret_cast<uintptr_t>(src) % alignment == 0 && - reinterpret_cast<uintptr_t>(dst) % alignment == 0) { - // Optimized version using packets - size_t num_packets = rows / psize; - for (Index col = 0; col < cols; ++col) { - EIGEN_ASM_COMMENT("begin pack_simple inner copy"); - // Unrolled manually 4 times. 
- for (size_t i=0; i < num_packets/4; ++i) { - internal::pstore(dst, internal::pload<Packet>(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload<Packet>(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload<Packet>(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload<Packet>(src)); - dst += psize; src += psize; - } - for (size_t i=0; i < num_packets%4; ++i) { - internal::pstore(dst, internal::pload<Packet>(src)); - dst += psize; src += psize; - } - dst += lddst - num_packets*psize; - src += ldsrc - num_packets*psize; - EIGEN_ASM_COMMENT("end pack_simple inner copy"); - } - } else { - // Naive memcpy calls - for (Index col = 0; col < cols; ++col) { - memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); - } - } -} - -template<typename LhsScalar, typename RhsScalar, typename Scalar> - struct libxsmm_wrapper { - libxsmm_wrapper() {} - libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {} - void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {} - void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {} - }; - - template<> - struct libxsmm_wrapper<float, float, float>: public libxsmm_mmfunction<float> { - libxsmm_wrapper(): libxsmm_mmfunction() {} - libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) : - libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {} - }; - - template<> - struct libxsmm_wrapper<double, double, double>: public libxsmm_mmfunction<double> { - libxsmm_wrapper(): libxsmm_mmfunction() {} - libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) : - libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {} - }; -#endif - template<typename Dimensions, typename LhsXprType, typename RhsXprType> struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> > @@ -222,7 +158,7 @@ struct TensorContractionEvaluatorBase m_device(device), m_result(NULL) { EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == - static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), + static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -381,8 +317,6 @@ struct TensorContractionEvaluatorBase } } - EnableXSMMIfPossible(eval_op_indices); - // If the layout is RowMajor, we need to reverse the m_dimensions if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) { @@ -393,7 +327,7 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar * data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { @@ -488,13 +422,6 @@ struct TensorContractionEvaluatorBase template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { - #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - if (m_can_use_xsmm) { - evalGemmXSMM(buffer); - return; - } - #endif - // columns in left side, rows in right side const Index k = this->m_k_size; @@ -611,214 +538,7 @@ struct 
TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } -protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) { - m_can_use_xsmm = false; - -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; - typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; - if (!std::is_same<Scalar, LhsScalar>::value || - !std::is_same<Scalar, RhsScalar>::value || - !(std::is_same<Scalar, float>::value || - std::is_same<Scalar, double>::value) || - m_leftImpl.data() == NULL || - m_rightImpl.data() == NULL) { - return; - } - - // Check if we can use faster matmul algorithms. For contraction to be - // equivalent to matmul, we need both lhs and rhs contracting dims sequences - // to be either a prefix or suffix of all dims. Also, the order of both - // must be the same, so we don't have to do reordering. - // For example: - // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)] - // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)] - // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)] - // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)] - // Depending if contraction dims are prefix or suffix of all dims we need to - // pre-transpose matrices in matmul algorithm: - // lhs: prefix -> transpose, suffix -> no transpose - // rhs: prefix -> no transpose, suffix -> transpose - // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular, - // non-transposed matmul. - if (ContractDims == 0) { - // This case is totally uninteresting, filter it out to avoid problems - // with iterations in further tests. - return; - } - - // Check if RHS dims list is increasing. LHS already is, so if not, the - // order is different and we cannot do matmul. - for (int i = 1; i < ContractDims; i++) { - if (eval_op_indices[i].second < eval_op_indices[i-1].second) { - return; - } - } - - // Check if no holes. - int diff; - for (int i = 1; i < ContractDims; i++) { - // LHS contract dims are sorted to form an increasing seq. - diff = eval_op_indices[i].first - eval_op_indices[i-1].first; - if (diff != 1) { - return; - } - // Now we may already assume RHS contract dims seq is increasing too. - diff = eval_op_indices[i].second - eval_op_indices[i-1].second; - if (diff != 1) { - return; - } - } - - // Check if suffix or prefix. 
- if (eval_op_indices[0].first != 0 && - eval_op_indices[ContractDims-1].first != LDims-1) { - return; - } - if (eval_op_indices[0].second != 0 && - eval_op_indices[ContractDims-1].second != RDims-1) { - return; - } - - m_can_use_xsmm = true; -#else - EIGEN_UNUSED_VARIABLE(eval_op_indices); -#endif - } - -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - const bool transposeA = !m_lhs_inner_dim_contiguous; - const bool transposeB = !m_rhs_inner_dim_contiguous; - - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; - typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; - - internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index> blocking( - k, m, n, 1, transposeA, transposeB); - - // Outer blocks sizes - const Index mc_outer = blocking.outer_m(); - const Index nc_outer = blocking.outer_n(); - const Index kc_outer = blocking.outer_k(); - // Inner blocks sizes - const Index mc = blocking.mc(); - const Index nc = blocking.nc(); - const Index kc = blocking.kc(); - // Decisions whether we should copy parts of matrices - const bool copyA = blocking.copyA(); - const bool copyB = blocking.copyB(); - - const LhsScalar* leftData = m_leftImpl.data(); - const RhsScalar* rightData = m_rightImpl.data(); - - const libxsmm_blasint stride_A = static_cast<libxsmm_blasint>(transposeA ? k : m); - const libxsmm_blasint stride_B = static_cast<libxsmm_blasint>(transposeB ? n : k); - const libxsmm_blasint stride_C = static_cast<libxsmm_blasint>(m); - - const libxsmm_blasint stride_blockA = static_cast<libxsmm_blasint>(mc); - // Use bigger stride to avoid hitting same cache line too often. - // This consistently gives +~0.5 Gflops. - const libxsmm_blasint stride_panelB = static_cast<libxsmm_blasint>( - kc % 32 == 0 ? kc + 16 : kc - ); - - // Kernel for the general case (not edges) - internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar> kernel; - - LhsScalar* blockA = NULL; - RhsScalar* panelB = NULL; - - if (copyA) { - blockA = static_cast<LhsScalar*>(this->m_device.allocate(mc * kc * sizeof(LhsScalar))); - } - if (copyB) { - panelB = static_cast<RhsScalar*>(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar))); - } - - const Index kernel_stride_A = copyA ? stride_blockA : stride_A; - const Index kernel_stride_B = copyB ? stride_panelB : stride_B; - kernel = internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch()); - - // Outer blocking - for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) { - for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) { - for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) { - using numext::mini; - - Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer; - - // Inner blocking - for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) { - const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki; - const float beta = ki == 0 ? 
0 : 1; - - if (copyB) { - if (transposeB) { - libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB); - } else { - internal::pack_simple<RhsScalar, Index>(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B); - } - } - - for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) { - const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi; - - const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki : - leftData + ki*stride_A + mi; - - if (copyA) { - if (transposeA) { - libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA); - } else { - internal::pack_simple<LhsScalar, Index>(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A); - } - } - const LhsScalar* actual_a = copyA ? blockA : a; - - for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) { - const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni; - - const RhsScalar* b = rightData + ni*stride_B + ki; - Scalar* c = buffer + ni*stride_C + mi; - const Scalar* cp = c + nc*stride_C; - - const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b; - const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B; - - if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) { - // Most used, cached kernel. - kernel(actual_a, actual_b, c, actual_a, bp, cp); - } else { - // Edges - use libxsmm kernel cache. - internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp); - } - } - } - } - } - } - } - - if (copyA) { - this->m_device.deallocate(blockA); - } - if (copyB) { - this->m_device.deallocate(panelB); - } - } -#endif - + protected: // Prevent assignment TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); Dimensions m_dimensions; @@ -844,7 +564,6 @@ protected: TensorEvaluator<EvalRightArgType, Device> m_rightImpl; const Device& m_device; Scalar* m_result; - bool m_can_use_xsmm; }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index d34f9ca..5cf7b4f 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -50,140 +50,6 @@ class TensorContractionBlocking { }; - -#if defined(EIGEN_USE_LIBXSMM) -template <typename LhsScalar, typename RhsScalar, typename Index> -class TensorXsmmContractionBlocking { - public: - TensorXsmmContractionBlocking(Index k, Index m, Index n, - size_t max_num_threads = 1, bool transposeA = false, - bool transposeB = false): - k_(k), m_(m), n_(n), transposeA_(transposeA), - transposeB_(transposeB), num_threads_(max_num_threads) { -#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES - if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { - mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M; - kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K; - nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N; - outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M; - outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K; - outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N; - copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A; - copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B; - outer_m_ = outer_m_ != 0 ? outer_m_ : m; - outer_k_ = outer_k_ != 0 ? 
outer_k_ : k; - outer_n_ = outer_n_ != 0 ? outer_n_ : n; - } -#else - // Defaults, possibly overriden per-platform. - copyA_ = true; - copyB_ = false; - - // If the matrix is small enough, don't do blocking, just call single xsmm - // kernel. - if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) { - mc_ = m; kc_ = k; nc_ = n; - outer_m_ = m; outer_k_ = k; outer_n_ = n; - copyA_ = false; copyB_ = false; - } else { - int arch = libxsmm_cpuid_x86(); - - if (arch == LIBXSMM_X86_AVX512_CORE) { - // skylake - mc_ = 64; kc_ = 64; nc_ = 24; - outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22; - // Hack to use this kernel architecture as the other one has performance - // issues (no hardware prefetching). - // TODO(nishantpatil): This should be removed if the issues are fixed, - // or this one becomes the default. - setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1); - } else if (arch == LIBXSMM_X86_AVX2) { - // haswell - mc_ = 32; kc_ = 192; nc_ = 33; - outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16; - } else if (arch == LIBXSMM_X86_AVX) { - // ivybridge - mc_ = 32; kc_ = 192; nc_ = 48; - outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11; - } else { - // generic kernel size, usually performing well - mc_ = 32; kc_ = 128; nc_ = 32; - outer_m_ = 512; outer_k_ = 512; outer_n_ = 512; - } - - // Only copy if it makes the stride smaller. - copyA_ = copyA_ && (m > mc_); - copyB_ = copyB_ && (k > kc_); - } - - // We need to copy anyway if transposing - copyA_ = copyA_ || transposeA; - copyB_ = copyB_ || transposeB; - - // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h - prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C; - -#endif - - mc_ = mc_ > m ? m : mc_; - nc_ = nc_ > n ? n : nc_; - kc_ = kc_ > k ? k : kc_; - - size_t compute_parallelism = (m / mc_) * (n / nc_); - size_t pack_parallelism = 0; - if (copyA_) { - pack_parallelism += (m / mc_) * (k / kc_); - } - if (copyB_) { - pack_parallelism += (n / nc_) * (k / kc_); - } - size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism); - - num_threads_ = numext::mini<size_t>(num_threads_, - parallelism / MIN_JOBS_PER_THREAD); - num_threads_ = numext::maxi<size_t>(num_threads_, 1); - - // For optimal performance outer block sizes should be multiplies of kernel - // sizes, or bigger than matrix size (=no outer blocking). 
- eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m); - eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k); - eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n); - } - - EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } - EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } - EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } - EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; } - EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; } - EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; } - EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; } - EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; } - EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; } - EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; } - EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; } - EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); } - EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); } - EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); } - EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const { - return prefetch_; - } - - private: - Index k_, m_, n_; - Index kc_, mc_, nc_; - Index outer_k_, outer_m_, outer_n_; - bool copyA_, copyB_, transposeA_, transposeB_; - size_t num_threads_; - - // Threshold for m*k*n to skip blocking and just call libxsmm - const double LIBXSMM_THRESHOLD = 80*80*80; - // For computing optimal number of threads - so that each thread gets at least - // that many jobs. - const double MIN_JOBS_PER_THREAD = 3; - libxsmm_gemm_prefetch_type prefetch_; -}; -#endif // EIGEN_USE_LIBXSMM - } // end namespace internal } // end namespace Eigen diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index c04b784..d65dbb4 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -529,6 +529,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh float2 rhs_shmem2[][8], const Index m_size, const Index n_size, const Index k_size, const Index base_m, const Index base_n) { + typedef float Scalar; // prefetch registers float4 lhs_pf0, rhs_pf0; @@ -539,27 +540,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - reg =lhs.template loadPacket<Unaligned>(row, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - reg =lhs.template loadPacket<Unaligned>(row, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.loadPacket<Unaligned>(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.loadPacket<Unaligned>(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + 
} \ + } \ Index lhs_vert = base_m+threadIdx.x*4; @@ -577,7 +578,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -592,7 +593,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } else { if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); @@ -765,6 +766,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, float2 rhs_shmem2[][8], const Index m_size, const Index n_size, const Index k_size, const Index base_m, const Index base_n) { + typedef float Scalar; // prefetch registers float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; @@ -788,37 +790,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_LHS_BOUNDARY) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24)); + lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); } } else { // just CHECK_LHS_BOUNDARY if (lhs_vert + 3 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24)); + lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, 
(threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8)); } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k)); } } else if (lhs_vert + 2 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { @@ -907,8 +909,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -930,8 +932,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (rhs_horiz1 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -952,7 +954,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, } else if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -1135,6 +1137,9 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, typedef float2 LHS_MEM[64][32]; typedef float2 RHS_MEM[128][8]; + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + const Index m_block_idx = blockIdx.x; const Index n_block_idx = blockIdx.y; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index ab320a5..9b2cb3f 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -22,14 +22,8 @@ enum { /* * Implementation of the Eigen blas_data_mapper class for tensors. */ -/// The make pointer class is used by sycl in order to build the mapper class on the device. 
For other platform the default make pointer is used which -/// is scalar * for CoeffLoader. -template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer> struct CoeffLoader; -template<typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t, - int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, - template <class> class MakePointer_ = MakePointer> class BaseTensorContractionMapper; - -template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_> struct CoeffLoader { + +template <typename Tensor, bool HasRawAccess> struct CoeffLoader { enum { DirectOffsets = false }; @@ -53,7 +47,7 @@ template <typename Tensor, bool HasRawAccess, template <class> class MakePointer const Tensor m_tensor; }; -template <typename Tensor, template <class> class MakePointer_> struct CoeffLoader<Tensor, true, MakePointer_> { +template <typename Tensor> struct CoeffLoader<Tensor, true> { enum { DirectOffsets = true }; @@ -73,14 +67,13 @@ template <typename Tensor, template <class> class MakePointer_> struct CoeffLoad } private: typedef typename Tensor::Scalar Scalar; - - typename MakePointer_<const Scalar>::Type m_data; + const Scalar* m_data; }; template<typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t, - int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer> + int packet_size, bool inner_dim_contiguous, int Alignment> class SimpleTensorContractionMapper { public: EIGEN_DEVICE_FUNC @@ -96,7 +89,7 @@ class SimpleTensorContractionMapper { m_k_strides(k_strides) { } enum { - DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets + DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets }; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { @@ -213,22 +206,23 @@ class SimpleTensorContractionMapper { } protected: - CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor; + CoeffLoader<Tensor, Tensor::RawAccess> m_tensor; const nocontract_t m_nocontract_strides; const nocontract_t m_ij_strides; const contract_t m_contract_strides; const contract_t m_k_strides; }; + template<typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t, int packet_size, bool inner_dim_contiguous, - bool inner_dim_reordered, int Alignment, template <class> class MakePointer_> -class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> + bool inner_dim_reordered, int Alignment> +class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> { public: - typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper; + typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -241,9 +235,9 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, typedef typename Tensor::PacketReturnType Packet; typedef typename unpacket_traits<Packet>::half HalfPacket; 
- template <typename PacketT,int AlignmentType> + template <int AlignmentType> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { // whole method makes column major assumption // don't need to add offsets for now (because operator handles that) @@ -281,13 +275,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, } data[packet_size - 1] = this->m_tensor.coeff(last); - return pload<PacketT>(data); - } - - template <int AlignmentType> - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - return this->load<Packet,AlignmentType>(i,j); + return pload<Packet>(data); } template <int AlignmentType> @@ -313,11 +301,11 @@ template<typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t, bool inner_dim_contiguous, - bool inner_dim_reordered, int Alignment, template <class> class MakePointer_> -class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> + bool inner_dim_reordered, int Alignment> +class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> { public: - typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper; + typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -334,12 +322,6 @@ class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, con data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); return pload<typename Tensor::PacketReturnType>(data); } - template <typename PacketT,int> EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { - EIGEN_ALIGN_MAX Scalar data[1]; - data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload<PacketT>(data); - } template <int> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { return loadPacket(i, j); @@ -351,14 +333,14 @@ template<typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t, int packet_size, - bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer> + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionSubMapper { public: typedef typename Tensor::PacketReturnType Packet; typedef typename unpacket_traits<Packet>::half HalfPacket; - typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper; - typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self; + typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 
packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self; typedef Self LinearMapper; enum { @@ -403,14 +385,6 @@ class TensorContractionSubMapper { return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset); } - template <typename PacketT, int AlignmentType> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { - if (UseDirectOffsets) { - return m_base_mapper.template load<PacketT,AlignmentType>(i, j); - } - return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { if (UseDirectOffsets) { return m_base_mapper.template loadHalfPacket<Alignment>(i, 0); @@ -418,7 +392,7 @@ class TensorContractionSubMapper { return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet& p) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { if (UseDirectOffsets) { m_base_mapper.storePacket(i, 0, p); } @@ -458,14 +432,14 @@ template<typename Scalar_, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t, int packet_size, - bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer> + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionInputMapper - : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> { + : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: typedef Scalar_ Scalar; - typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base; - typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper; + typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base; + typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper; typedef SubMapper VectorMapper; EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h deleted file mode 100644 index e87de0c..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ /dev/null @@ -1,400 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclConvertToDeviceExpression.h - * - * \brief: - * TensorContractionsycl - * -*****************************************************************/ - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H -namespace Eigen { - -template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels; -template<typename Indices, typename LeftArgType, typename RightArgType> -struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> : - public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > { - - typedef const Eigen::SyclDevice Device; - - typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self; - typedef TensorContractionEvaluatorBase<Self> Base; - typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType; - typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - - enum { - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. 
- typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value; - static const int RDims = - internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; - static const int ContractDims = internal::array_size<Indices>::value; - - typedef array<Index, LDims> left_dim_mapper_t; - typedef array<Index, RDims> right_dim_mapper_t; - - typedef array<Index, ContractDims> contract_t; - typedef array<Index, LDims - ContractDims> left_nocontract_t; - typedef array<Index, RDims - ContractDims> right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes<Index, NumDims> Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; - typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar; - - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - - // We need to redefine this method to make nvcc happy - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast<Scalar*>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; - } - } - const Eigen::SyclDevice& device() const {return this->m_device;} - void evalTo(Scalar* buffer) const { - // Here is the result - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<true, true, true, Unaligned>(buffer); - } - else { - evalTyped<true, true, false, Unaligned>(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<true, false, true, Unaligned>(buffer); - } - else { - evalTyped<true, false, false, Unaligned>(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<false, true, true, Unaligned>(buffer); - } - else { - evalTyped<false, true, false, Unaligned>(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped<false, false, true, Unaligned>(buffer); - } - else { - evalTyped<false, false, false, Unaligned>(buffer); - } - } - } - } - - template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - // rows in left side - const Index m = this->m_i_size; - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - 
LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k, - this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, - this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); - } - // required by sycl to construct the expr on the device. Returns original left_impl - const TensorEvaluator<LeftArgType, Device>& left_impl() const { - return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_leftImpl, this->m_rightImpl); - } - // required by sycl to construct the expr on the device. Returns original right_impl - const TensorEvaluator<RightArgType, Device>& right_impl() const { - return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl); - } -}; - -template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT, -typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, -typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN, -typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{ - typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr; - typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr; - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr; - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr; - LHSFunctorExpr lhs_functors; - RHSFunctorExpr rhs_functors; - LhsLocalAcc localLhs; - RhsLocalAcc localRhs; - OutAccessor out_res; - Index roundUpK, M, N, K; - ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; - LeftNocontractT m_i_strides, m_left_nocontract_strides; - RightNocontractT m_j_strides, m_right_nocontract_strides; - LHSTupleType left_tuple_of_accessors; - RHSTupleType right_tuple_of_accessors; - Device dev; - - - KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, - Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, - ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, - LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) - :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), - m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), - m_right_contracting_strides(m_right_contracting_strides_), - m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), 
- m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_), - left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){} - - void operator()(cl::sycl::nd_item<1> itemID) { - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr; - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr; - auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors); - auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors); - typedef decltype(lhs_dev_expr.expr) LeftArgType; - typedef decltype(rhs_dev_expr.expr) RightArgType; - typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; - typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; - typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, - LeftEvaluator, LeftNocontractT, - ContractT, 1, - lhs_inner_dim_contiguous, - false, Unaligned, MakeGlobalPointer> LhsMapper; - - typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, - RightEvaluator, RightNocontractT, - ContractT, 1, - rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper; - // initialize data mappers must happen inside the kernel for device eval - LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(), - lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); - RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(), - rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); - auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); - // Matmul Kernel - // Thread identifiers - const Index mLocalThreadId = itemID.get_local(0); // Local ID row - const Index nLocalThreadId = itemID.get_local(1); // Local ID col - const Index mGroupId = itemID.get_group(0); // Work-group ID row - const Index nGroupId = itemID.get_group(1); // Work-group ID localCol - const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID - // Allocate register space - float privateLhs; - float privateRhs[WorkLoadPerThreadN]; - float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; - // Initialise the privateResumulation registers - for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - privateRes[wLPTM][wLPTN] = 0.0f; - } - } - - // Tile Lhs - for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { - Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - Index localLhsRow = localLhsLinearId% 
TileSizeDimM; - Index localLhsCol = localLhsLinearId/TileSizeDimM; - // Load the value (wide vector load) - Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol; - localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0); - } - // Tile Rhs - for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { - Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - Index localRhsRow = localRhsLinearId% TileSizeDimN; - Index localRhsCol = localRhsLinearId/TileSizeDimN; - // Load the value (wide vector load) - Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol; - localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0); - - } - // Loop over all tiles - const Index numTiles = roundUpK/TileSizeDimK; - Index firstHalf=0; - do { - // Synchronise - itemID.barrier(cl::sycl::access::fence_space::local_space); - // Load the next tile of Lhs and Rhs into local memory - Index nextHalf = firstHalf + 1; - if (nextHalf < numTiles) { - // Tile A - for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) { - Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - Index localLhsRow = localLhsLinearId% TileSizeDimM; - Index localLhsCol = localLhsLinearId/TileSizeDimM; - // global K id - Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol; - // Store the loaded value into local memory - localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0); - } - // Tile B - for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) { - Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId; - Index localRhsRow = localRhsLinearId% TileSizeDimN; - Index localRhsCol = localRhsLinearId/TileSizeDimN; - // Load the value (wide vector load) - Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol; - // Store the loaded vector into local memory - localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? 
rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0); - } - } - // Loop over the values of a single tile - for (Index k=0; k<TileSizeDimK; k++) { - // Cache the values of localRhs in registers - for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN; - privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)]; - } - // Perform the computation - for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM; - privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)]; - for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN]; - } - } - } - // Next tile - firstHalf++; - } while (firstHalf<numTiles); - - // Store the final results in C - for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) { - Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM; - if (globalRow< M){ - for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) { - Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN; - if(globalCol<N) - out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN]; - } - } - } - - } - -}; -template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels { - -static const Index TileSizeDimM = 32ul; // Tile size for dimension M -static const Index TileSizeDimN = 32ul; // Tile size for dimension N -static const Index TileSizeDimK = 16ul; // Tile size for dimension K -static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M -static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N -static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) -static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here) -static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression -static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression - -// RoundUp function to make sure that the global threadId is divisable by local threadId -static Index RoundUp(Index x, Index y) { - return ((((x) + (y) - 1) / (y))*(y)); -} - -template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT> - static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K, - ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides, - LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){ - - typedef typename Self::XprType HostExpr; - typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr; - typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr; - typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr; - typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr; - typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr; - typedef 
Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> RHSFunctorExpr; - // extract lhs functor list - LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); - // extract rhs functor list - RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); - - Index roundUpK = RoundUp(K, TileSizeDimK); - Index roundUpM = RoundUp(M, TileSizeDimM); - Index roundUpN = RoundUp(N, TileSizeDimN); - - self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { - /// work-around for gcc bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType; - /// work-around for gcc bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType; - // create lhs tuple of accessors - LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl()); - // create rhs tuple of accessors - RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl()); - - // Local memory for elements of Lhs - typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc; - LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh); - // Local memory for elements of Rhs - typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc; - RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); - - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor; - //OutScalar memory - OutAccessor out_res= self.device(). 
template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer); - - // sycl parallel for - cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), - cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), - KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT, - RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK, - WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors, - localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, - m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); - }); - self.device().asynchronousExec(); - } -}; - -} // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index d30cc96..ee16cde 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -116,28 +116,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> void evalProduct(Scalar* buffer) const { - const Index m = this->m_i_size; - const Index n = this->m_j_size; - const Index k = this->m_k_size; - if (m == 0 || n == 0 || k == 0) return; - -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - if (this->m_can_use_xsmm) { - bool transposeA = !this->m_lhs_inner_dim_contiguous; - bool transposeB = !this->m_rhs_inner_dim_contiguous; - internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index> - blocking(k, m, n, this->m_device.numThreads(), transposeA, - transposeB); - - if (blocking.num_threads() == 1) { - this->evalGemmXSMM(buffer); - } else { - ContextXsmm<Alignment>(this, buffer, m, n, k, blocking).run(); - } - return; - } -#endif - typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar; @@ -169,7 +147,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT Traits::mr, Traits::nr, false, false> GebpKernel; - + const Index m = this->m_i_size; + const Index n = this->m_j_size; + const Index k = this->m_k_size; + if (m == 0 || n == 0 || k == 0) return; // Compute a set of algorithm parameters: // - kernel block sizes (bm, bn, bk) @@ -1063,187 +1044,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT rhsCost.dropMemoryCost(); return cost + lhsCost + rhsCost; } - -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - template<int Alignment> - class ContextXsmm { - public: - ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k, - const internal::TensorXsmmContractionBlocking<LhsScalar, - RhsScalar, Index>& blocking): - device(self->m_device), - m(m), k(k), n(n), - stride_a(blocking.transposeA() ? k : m), - stride_b(blocking.transposeB() ? 
n : k), - stride_c(m), - bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()), - blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()), - blocks_n(blocking.blocks_n()), - copyA(blocking.copyA()), copyB(blocking.copyB()), - transposeA(blocking.transposeA()), transposeB(blocking.transposeB()), - num_threads(blocking.num_threads()), - buffer(buffer), - leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()), - workers_done(blocking.num_threads()), - - packingA_jobs(0), packingB_jobs(0), compute_jobs(0), - packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()) {} - - void worker() { - // Pack - - if (copyA) { - while (true) { - uint32_t mk = packingA_jobs++; - Index mi = mk / blocks_k; - Index ki = mk % blocks_k; - if (mi >= blocks_m) break; - - LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki); - if (transposeA) { - const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki); - libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki), - actual_bm(mi), stride_a, bm); - } else { - const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi); - internal::pack_simple<LhsScalar, Index>(blockA, current_a, - actual_bk(ki), actual_bm(mi), bm, stride_a); - } - packingA_done.at(mi)++; - } - } - - if (copyB) { - while (true) { - uint32_t nk = packingB_jobs++; - Index ni = nk / blocks_k; - Index ki = nk % blocks_k; - if (ni >= blocks_n) break; - - RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki); - if (transposeB) { - const RhsScalar * current_b = rightData + (ki*bk)*stride_b + - (ni*bn); - libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni), - actual_bk(ki), stride_b, bk); - } else { - const RhsScalar * current_b = rightData + (ni*bn)*stride_b + - (ki*bk); - internal::pack_simple<RhsScalar, Index>(blockB, current_b, - actual_bn(ni), actual_bk(ki), bk, stride_b); - } - packingB_done.at(ni)++; - } - } - - // Compute - - while (true) { - uint32_t mn = compute_jobs++; - Index mi = mn / blocks_n; - Index ni = mn % blocks_n; - if (mi >= blocks_m) break; - - // Wait for mi, ni packings to be done. This is more fine-grained than - // waiting for all workers to finish packing. - while ((copyA && (packingA_done.at(mi) < blocks_k)) || - (copyB && (packingB_done.at(ni) < blocks_k))) - {} - - for (Index ki=0; ki < blocks_k; ++ki) { - const LhsScalar * current_a = copyA ? - blocksA + (bk*bm) * (mi*blocks_k+ki) : - leftData + (bk*ki)*stride_a + (bm*mi); - const RhsScalar * current_b = copyB ? - blocksB + (bk*bn) * (ni*blocks_k+ki) : - rightData + (ni*bn)*stride_b + (bk*ki); - - Index current_stride_a = copyA ? bm : stride_a; - Index current_stride_b = copyB ? bk : stride_b; - - // Memory may not be zeroed, overwrite instead of adding in first - // iteration. - float beta = ki == 0 ? 0 : 1; - - Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c; - internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>( - 0, actual_bm(mi), actual_bn(ni), actual_bk(ki), - current_stride_a, current_stride_b, stride_c, 1, beta, 0) - (current_a, current_b, current_c); - } - } - - workers_done.Notify(); - } - - void run() { - // Parallelization strategy. - // - // First pack A into blocks (sharding by m, k) and B (sharding by n,k), - // then shard by m, n. - // - // Do not use advanced ThreadPool queuing, just run a single long-standing - // function in each thread. 
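The worker() body removed above distributes packing and compute jobs through shared atomic counters: each linearized job index is decomposed into a (block-row, block-column) pair, and a thread keeps claiming the next index until it runs past the block grid. A self-contained sketch of that hand-out pattern, assuming placeholder block counts and an empty job body:

#include <atomic>
#include <cstdint>
#include <thread>
#include <vector>

int main() {
  const uint32_t blocks_m = 4, blocks_k = 3;   // illustrative block grid
  std::atomic<uint32_t> jobs{0};               // shared job counter

  auto worker = [&]() {
    while (true) {
      const uint32_t mk = jobs++;              // claim the next linearized job
      const uint32_t mi = mk / blocks_k;       // block row
      const uint32_t ki = mk % blocks_k;       // block column
      if (mi >= blocks_m) break;               // grid exhausted, stop this thread
      (void)ki;                                // ... pack or compute block (mi, ki) ...
    }
  };

  std::vector<std::thread> pool;
  for (int i = 0; i < 4; ++i) pool.emplace_back(worker);  // long-running workers
  for (auto& t : pool) t.join();
  return 0;
}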
- if (copyA) { - blocksA = static_cast<LhsScalar*>(device.allocate( - (blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar))); - } - if (copyB) { - blocksB = static_cast<RhsScalar*>(device.allocate( - (blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar))); - } - - for (Index i = 0; i < num_threads; ++i) { - device.enqueueNoNotification([=]() { worker(); }); - } - - workers_done.Wait(); - - if (copyA) { - device.deallocate(blocksA); - } - if (copyB) { - device.deallocate(blocksB); - } - } - - private: - // real block size for block index in [0, ..., blocks - 1]. - Index actual_bm(Index mi) const { - return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m; - } - Index actual_bk(Index ki) const { - return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k; - } - Index actual_bn(Index ni) const { - return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n; - } - - const Device& device; - Index m, k, n; - Index stride_a, stride_b, stride_c; - Index bm, bk, bn; // Block sizes. - Index blocks_m, blocks_k, blocks_n; // Number of blocks in each dimension. - bool copyA, copyB, transposeA, transposeB; - Index num_threads; - Scalar *buffer; - const LhsScalar *leftData; - const RhsScalar *rightData; - - LhsScalar *blocksA; - RhsScalar *blocksB; - // barrier for joining all threads after all done. - Barrier workers_done; - // "queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q) - std::atomic<uint32_t> packingA_jobs; - std::atomic<uint32_t> packingB_jobs; - std::atomic<uint32_t> compute_jobs; - // already packed blocks for each mi-panel in A and ni-panel in B. - std::vector<std::atomic<uint8_t>> packingA_done; - std::vector<std::atomic<uint8_t>> packingB_done; - }; -#endif - }; } // end namespace Eigen diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index b29968b..860a694 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -246,9 +246,6 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the sycl accessor - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - protected: template <int LoadMode, bool ActuallyVectorize> struct PacketConv { diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 378f5cc..abdf742 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -100,7 +100,7 @@ class IndexMapper { } } else { for (int i = NumDims - 1; i >= 0; --i) { - if (static_cast<size_t>(i + 1) < offset) { + if (i + 1 < offset) { m_cudaInputStrides[i] = m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; m_cudaOutputStrides[i] = diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h deleted file mode 100644 index 4247c1c..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ /dev/null @@ -1,476 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. 
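The actual_bm/actual_bk/actual_bn helpers in the block removed above size every block at the nominal bm/bk/bn except the last one in each dimension, which gets the remainder total + b - b*blocks. A quick check of that formula with made-up sizes:

#include <cassert>
#include <cstdint>

// Same tail-block rule as the removed helpers, written once for any dimension.
static int64_t actual_block(int64_t i, int64_t blocks, int64_t b, int64_t total) {
  return i != blocks - 1 ? b : total + b - b * blocks;
}

int main() {
  const int64_t m = 10, bm = 4, blocks_m = 3;        // ceil(10 / 4) == 3 blocks
  assert(actual_block(0, blocks_m, bm, m) == 4);     // full block
  assert(actual_block(1, blocks_m, bm, m) == 4);     // full block
  assert(actual_block(2, blocks_m, bm, m) == 2);     // remainder: 10 + 4 - 12
  return 0;
}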
-// Contact: <eigen@codeplay.com> -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> - -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H - -namespace Eigen { - -/** \class TensorConvolution - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor convolution class. - * - * - */ -template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index, -typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType> -struct EigenConvolutionKernel1D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; -internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize, range_x, range_y; -Buffer_accessor buffer_acc; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} - - void operator()(cl::sycl::nd_item<2> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; - auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); //the required row to be calculated for the for each plane in shered memory - const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input; - const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1)); - /// fill the shared memory - for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { - const size_t local_index = i + plane_kernel_offset ; - const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start); - if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){ - local_acc[local_index] = device_evaluator.coeff(tensor_index); - } - else local_acc[local_index]=0.0f; - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - - // calculate the convolution - const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x - if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){ - CoeffReturnType result = 
static_cast<CoeffReturnType>(0); - const size_t index = plane_kernel_offset+ itemID.get_local(0); - for (size_t k = 0; k < kernelSize; ++k) { - result += (local_acc[k + index] * kernel_ptr[k]); - } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1)) - +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start); - buffer_ptr[tensor_index] = result; - } - } -}; - - -template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index, -typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType> -struct EigenConvolutionKernel2D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; -internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z; -Buffer_accessor buffer_acc; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} - - void operator()(cl::sycl::nd_item<3> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; - auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory - const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2)); - const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input; - - /// fill the shared memory - const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; - for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) { - const size_t local_input_offset = num_x_input * (j + plane_kernel_offset); - for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { - const size_t local_index = i + local_input_offset; - const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start ); - if(((i + first_x_input_start) < (range_x 
+kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){ - local_acc[local_index] = device_evaluator.coeff(tensor_index); - } - else local_acc[local_index]=0.0f; - } - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - - // calculate the convolution - const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x - const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y - if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){ - CoeffReturnType result = static_cast<CoeffReturnType>(0); - for (size_t j = 0; j < kernelSize_y; j++) { - size_t kernel_offset =kernelSize_x * j; - const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0); - for (size_t i = 0; i < kernelSize_x; i++) { - result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]); - } - } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2)) - +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start); - buffer_ptr[tensor_index] = result; - } - } -}; - - - -template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index, -typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType> -struct EigenConvolutionKernel3D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; -internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; -Buffer_accessor buffer_acc; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , - const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), - kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} - - void operator()(cl::sycl::nd_item<3> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr; - auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_y_input = (itemID.get_local_range()[1] 
+kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); //the required row to be calculated for the for each plane in shered memory - const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; - const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2]; - for(size_t p=0; p<numP; p++){ - /// fill the shared memory - const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); - for (size_t k = itemID.get_local(2); k < num_z_input; k += itemID.get_local_range()[2]) { - for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) { - for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { - const size_t local_index = i + (num_x_input * (j + (num_y_input * k))); - const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start , k+ first_z_input_start ); - if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) && ((j + first_y_input_start) < (range_y +kernelSize_y-1)) && ((k + first_z_input_start) < (range_z +kernelSize_z-1)) ){ - local_acc[local_index] = device_evaluator.coeff(tensor_index); - } - else local_acc[local_index]=0.0f; - } - } - } - itemID.barrier(cl::sycl::access::fence_space::local_space); - - // calculate the convolution - const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // x - const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // y - const size_t fitst_z_output_start =itemID.get_group(2)*(itemID.get_local_range()[2]); // z - - if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){ - CoeffReturnType result = static_cast<CoeffReturnType>(0); - for (size_t k = 0; k < kernelSize_z; k++) { - for (size_t j = 0; j < kernelSize_y; j++) { - for (size_t i = 0; i < kernelSize_x; i++) { - const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k); - const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2)))); - result += (local_acc[local_index] * kernel_ptr[kernel_index]); - } - } - } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p) - +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start ); - buffer_ptr[tensor_index] = result; - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - } - } -}; - - -template<typename Indices, typename InputArgType, typename KernelArgType> -struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice> -{ - typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType; - - static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions>::value; - static const int NumKernelDims = internal::array_size<Indices>::value; - typedef typename XprType::Index Index; - typedef DSizes<Index, NumDims> Dimensions; - typedef typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions KernelDimensions; - typedef const Eigen::SyclDevice Device; - 
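The three convolution kernels removed above share one tiling scheme: each work-group stages tileSize + kernelSize - 1 inputs per dimension into local memory (the tile plus its halo), zero-pads out-of-range loads, synchronizes, and then lets every output in the tile accumulate its kernelSize taps from the staged values. A plain CPU sketch of the 1D version of that scheme, with a hypothetical helper name, illustrative sizes, and no SYCL:

#include <cstddef>
#include <iostream>
#include <vector>

// "Valid" 1D convolution computed tile by tile; assumes in.size() >= kernel.size().
std::vector<float> conv1d_tiled(const std::vector<float>& in,
                                const std::vector<float>& kernel,
                                std::size_t tileSize) {
  const std::size_t kernelSize = kernel.size();
  const std::size_t outSize = in.size() - kernelSize + 1;
  std::vector<float> out(outSize, 0.0f);
  std::vector<float> local(tileSize + kernelSize - 1);   // tile + halo, like the local accessor above

  for (std::size_t start = 0; start < outSize; start += tileSize) {
    // Stage the tile plus its halo, zero-padding past the end of the input.
    for (std::size_t i = 0; i < local.size(); ++i) {
      const std::size_t g = start + i;
      local[i] = g < in.size() ? in[g] : 0.0f;
    }
    // Each output inside the tile reads only the staged values.
    for (std::size_t x = 0; x < tileSize && start + x < outSize; ++x) {
      float result = 0.0f;
      for (std::size_t k = 0; k < kernelSize; ++k)
        result += local[x + k] * kernel[k];
      out[start + x] = result;
    }
  }
  return out;
}

int main() {
  const std::vector<float> in{1, 2, 3, 4, 5, 6, 7, 8};
  const std::vector<float> kernel{1, 0, -1};
  for (float v : conv1d_tiled(in, kernel, 4)) std::cout << v << ' ';  // prints -2 six times
  std::cout << '\n';
  return 0;
}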
- enum { - IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned, - PacketAccess = false, - Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device) - : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - m_dimensions = m_inputImpl.dimensions(); - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - } - } - - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, const Eigen::SyclDevice>::type PacketReturnType; - typedef typename InputArgType::Scalar Scalar; - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - preloadKernel(); - m_inputImpl.evalSubExprsIfNeeded(NULL); - if (data) { - executeEval(data); - return false; - } else { - m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); - executeEval(m_buf); - return true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_buf) { - m_device.deallocate(m_buf); - m_buf = NULL; - } - if (m_local_kernel) { - m_device.deallocate((void*)m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. 
it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); - typedef TensorEvalToOp<const KernelArgType> EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable<const Eigen::SyclDevice, KernelArgType>::value; - internal::TensorExecutor<const EvalTo, const Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device); - m_kernel = local; - m_local_kernel = true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const { - typedef TensorEvaluator<InputArgType, const Eigen::SyclDevice> InputEvaluator; - typedef typename InputEvaluator::Dimensions InputDims; - - typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr; - // extract input functor list - InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); - - - m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { - - typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> InputLocalAcc; - /// work-around for gcc 4.8 auto bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl)) InputTupleType; - // create input tuple of accessors - InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl); - - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType; - OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data); - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType; - KernelAccessorType kernel_acc= m_device. 
template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel); - - switch (NumKernelDims) { - case 1: { - const size_t numX = dimensions()[m_indices[0]]; - const size_t numP = dimensions().TotalSize() / numX; - const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y; - m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y ); - const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y); - assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range - auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - const array<Index, 1> indices{{m_indices[0]}}; - const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}}; - internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), - EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, - InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( - indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors)); - break; - } - - case 2: { - const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1; - const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0; - const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const size_t numX = dimensions()[m_indices[idxX]]; - const size_t numY = dimensions()[m_indices[idxY]]; - const size_t numP = dimensions().TotalSize() / (numX*numY); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; - m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); - const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z; - assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range - auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - const array<Index, 2> indices {{m_indices[idxX], m_indices[idxY]}}; - const array<Index, 2> kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}}; - internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), - EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, - InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors)); - break; - } - - case 3: { - const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2; - const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1; - const size_t idxZ =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 
2 : 0; - const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ]; - const size_t numX = dimensions()[m_indices[idxX]]; - const size_t numY = dimensions()[m_indices[idxY]]; - const size_t numZ = dimensions()[m_indices[idxZ]]; - const size_t numP = dimensions().TotalSize() / (numX*numY*numZ); - const array<Index, 3> indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}}; - const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}}; - internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; - m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); - const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_y -1); - assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range - auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), - EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index, - InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY, - numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors)); - break; - } - - default: { - EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); - } - } - }); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return m_buf[index]; - } - - template<int LoadMode> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost - // model. - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. 
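The costPerCoeff estimate that follows (per its TODO, a copy of the CPU convolution cost model) charges one add and one multiply per kernel tap plus an index-computation term proportional to the number of dimensions, on top of the input and kernel evaluators' own per-tap costs. A rough numeric restatement under assumed unit costs (the real values come from TensorOpCost::AddCost/MulCost/DivCost):

#include <iostream>

int main() {
  const double add_cost = 1.0, mul_cost = 1.0, div_cost = 8.0;  // assumed unit costs
  const int NumDims = 3;
  const double kernel_size = 5 * 5;              // e.g. a 5x5 kernel

  const double convolve_compute_cost = add_cost + mul_cost;
  const double firstIndex_compute_cost =
      NumDims * (2 * add_cost + 2 * mul_cost + div_cost);
  // Per output coefficient, ignoring the evaluators' own costPerCoeff terms
  // that the real code adds once per kernel tap.
  const double per_coeff =
      firstIndex_compute_cost + kernel_size * convolve_compute_cost;
  std::cout << per_coeff << "\n";                // 3*12 + 25*2 = 86
  return 0;
}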
- const double convolve_compute_cost = - TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + - TensorOpCost::DivCost<Index>()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - private: - // No assignment (copies are needed by the kernels) - TensorEvaluator& operator = (const TensorEvaluator&); - TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl; - KernelArgType m_kernelArg; - TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl; - Indices m_indices; - Dimensions m_dimensions; - Scalar* m_buf; - const Scalar* m_kernel; - bool m_local_kernel; - const Eigen::SyclDevice& m_device; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index be8d693..4f5767b 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -88,7 +88,7 @@ static void initializeDeviceProp() { #if __cplusplus >= 201103L std::atomic_thread_fence(std::memory_order_acquire); #endif - EIGEN_SLEEP(1000); + sleep(1); } } } @@ -217,10 +217,7 @@ struct GpuDevice { EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else - EIGEN_UNUSED_VARIABLE(dst); - EIGEN_UNUSED_VARIABLE(src); - EIGEN_UNUSED_VARIABLE(n); - eigen_assert(false && "The default device should be used instead to generate kernel code"); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index ccaaa6c..9d14139 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -45,7 +45,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#ifndef __CUDA_ARCH__ // Running on the host CPU return l1CacheSize(); #else @@ -55,7 +55,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#ifndef __CUDA_ARCH__ // Running single threaded on the host CPU return l3CacheSize(); #else diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e209799..7c03989 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -16,400 +16,107 @@ #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H namespace Eigen { - - #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer()))) - - template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor { - public: - MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), 
m_rng(rng), m_i(i), m_offset(offset) {} - - void operator()(cl::sycl::nd_item<1> itemID) { - auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc); - auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc); - auto globalid = itemID.get_global_linear_id(); - if (globalid < m_rng) { - dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset]; - } - } - - private: - read_accessor m_src_acc; - write_accessor m_dst_acc; - size_t m_rng; - size_t m_i; - size_t m_offset; - }; - - struct memsetkernelFunctor{ - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType; - AccType m_acc; - const size_t m_rng, m_c; - memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){} - void operator()(cl::sycl::nd_item<1> itemID) { - auto globalid=itemID.get_global_linear_id(); - if (globalid< m_rng) m_acc[globalid] = m_c; - } - - }; - -EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ - auto devices = cl::sycl::device::get_devices(); - std::vector<cl::sycl::device>::iterator it =devices.begin(); - while(it!=devices.end()) { - /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) - auto s= (*it).template get_info<cl::sycl::info::device::vendor>(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs - it=devices.erase(it); - } - else{ - ++it; - } - } - return devices; -} - -struct QueueInterface { - /// class members: - bool exception_caught_ = false; - - mutable std::mutex mutex_; - +struct SyclDevice { + /// class members + /// sycl queue + mutable cl::sycl::queue m_queue; /// std::map is the container used to make sure that we create only one buffer /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map; - /// sycl queue - mutable cl::sycl::queue m_queue; - /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename - /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. - template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s): + mutable std::map<const void *, std::shared_ptr<void>> buffer_map; + /// creating device by using selector + template<typename dev_Selector> SyclDevice(dev_Selector s) + : #ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { + m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) { for (const auto& e : l) { try { - if (e) { - exception_caught_ = true; - std::rethrow_exception(e); - } + std::rethrow_exception(e); } catch (cl::sycl::exception e) { - std::cerr << e.what() << std::endl; - } + std::cout << e.what() << std::endl; + } } })) #else -m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - if (e) { - exception_caught_ = true; - std::cerr << "Error detected Inside Sycl Device."<< std::endl; - - } - } -})) + m_queue(cl::sycl::queue(s)) #endif {} + // destructor + ~SyclDevice() { deallocate_all(); } - /// Allocating device pointer. 
This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. - /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key - /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we - /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. - /// The device pointer would be deleted by calling deallocate function. - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes)); - auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer(); - buf.set_final_data(nullptr); - std::lock_guard<std::mutex> lock(mutex_); - buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf)); - return static_cast<void*>(ptr); - } - - /// This is used to deallocate the device pointer. p is used as a key inside - /// the map to find the device buffer and delete it. - EIGEN_STRONG_INLINE void deallocate(void *p) const { - std::lock_guard<std::mutex> lock(mutex_); - auto it = buffer_map.find(static_cast<const uint8_t*>(p)); + template <typename T> void deallocate(T *p) const { + auto it = buffer_map.find(p); if (it != buffer_map.end()) { buffer_map.erase(it); + internal::aligned_free(p); } } - - EIGEN_STRONG_INLINE void deallocate_all() const { - std::lock_guard<std::mutex> lock(mutex_); - buffer_map.clear(); - } - - EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator find_buffer(const void* ptr) const { - std::lock_guard<std::mutex> lock(mutex_); - auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr)); - if (it1 != buffer_map.end()){ - return it1; - } - else{ - for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ - auto size = it->second.get_size(); - if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it; - } - } - std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; - abort(); - } - - // This function checks if the runtime recorded an error for the - // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { - if (!exception_caught_) { - m_queue.wait_and_throw(); + void deallocate_all() const { + std::map<const void *, std::shared_ptr<void>>::iterator it=buffer_map.begin(); + while (it!=buffer_map.end()) { + auto p=it->first; + buffer_map.erase(it); + internal::aligned_free(const_cast<void*>(p)); + it=buffer_map.begin(); } - return !exception_caught_; + buffer_map.clear(); } - // destructor - ~QueueInterface() { buffer_map.clear(); } -}; - -struct SyclDevice { - // class member. - QueueInterface* m_queue_stream; - /// QueueInterface is not owned. it is the caller's responsibility to destroy it. - explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} - - /// Creation of sycl accessor for a buffer. This function first tries to find + /// creation of sycl accessor for a buffer. This function first tries to find /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - /// the function then adds an entry by creating a sycl buffer for that particular pointer. 
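The QueueInterface removed in this hunk keys every SYCL buffer by the host pointer returned from allocate(), and find_buffer() resolves interior pointers by scanning for the entry whose [base, base + size) range contains them. A minimal sketch of that lookup, with a plain Block struct standing in for cl::sycl::buffer and a hypothetical 256-byte allocation:

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <map>

struct Block { std::size_t size; };            // stand-in for a device buffer

using BufferMap = std::map<const uint8_t*, Block>;

BufferMap::iterator find_buffer(BufferMap& m, const void* ptr) {
  const uint8_t* p = static_cast<const uint8_t*>(ptr);
  auto it = m.find(p);                         // fast path: p is a base pointer
  if (it != m.end()) return it;
  for (it = m.begin(); it != m.end(); ++it)    // slow path: p points inside a buffer
    if (it->first < p && p < it->first + it->second.size) return it;
  std::cerr << "No sycl buffer found for this pointer\n";
  std::abort();
}

int main() {
  BufferMap buffer_map;
  uint8_t* base = static_cast<uint8_t*>(std::malloc(256));
  buffer_map.emplace(base, Block{256});
  // The base pointer and a pointer 64 bytes into the allocation resolve to the
  // same entry, which is how offset tensor pointers find their backing buffer.
  const bool same = find_buffer(buffer_map, base) == find_buffer(buffer_map, base + 64);
  std::cout << (same ? "same buffer\n" : "different buffers\n");
  std::free(base);
  return 0;
}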
- template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return (get_sycl_buffer(ptr).template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh)); + ///the function then adds an entry by creating a sycl buffer for that particular pointer. + template <cl::sycl::access::mode AcMd, typename T> inline cl::sycl::accessor<T, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const { + return (get_sycl_buffer<T>(num_bytes, ptr)->template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh)); } - /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->find_buffer(ptr)->second; + template<typename T> inline std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const { + using Type = cl::sycl::buffer<T, 1>; + std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> ret = buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(ptr, std::shared_ptr<void>(new Type(cl::sycl::range<1>(num_bytes)), + [](void *dataMem) { delete static_cast<Type*>(dataMem); }))); + (static_cast<Type*>(buffer_map.at(ptr).get()))->set_final_data(nullptr); + return ret; } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template<typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast<Index>(sycl_queue().get_device(). 
template get_info<cl::sycl::info::device::max_work_group_size>()); - auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize)); - } - rng = n; - if (rng==0) rng=static_cast<Index>(1); - GRange=rng; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - Index xMode = static_cast<Index>(GRange % tileSize); - if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode); - } - } - - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template<typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size)); - } - Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); - tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast<Index>(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast<Index>(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode); - } - tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast<Index>(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast<Index>(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode); - } + template <typename T> inline cl::sycl::buffer<T, 1>* get_sycl_buffer(size_t num_bytes,const T * ptr) const { + return static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(ptr, num_bytes).first->second.get()); } - - - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template<typename Index> - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size)); - } - Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size)); - tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3))); - rng2=dim2; - if (rng2==0 ) rng1=static_cast<Index>(1); - GRange2=rng2; - if (tileSize2>GRange2) tileSize2=GRange2; - else if(GRange2>tileSize2){ - Index xMode = static_cast<Index>(GRange2 % tileSize2); - if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode); - } - pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast<Index>(1); - GRange1=rng1; - if (tileSize1>GRange1) 
tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast<Index>(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode); - } - tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2)); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast<Index>(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast<Index>(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode); - } - } - /// allocate device memory - EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { - return m_queue_stream->allocate(num_bytes); + /// allocating memory on the cpu + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const { + return internal::aligned_malloc(8); } - /// deallocate device memory - EIGEN_STRONG_INLINE void deallocate(void *p) const { - m_queue_stream->deallocate(p); - } // some runtime conditions that can be applied here - EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + bool isDeviceSuitable() const { return true; } - /// the memcpy function - template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(src)); - auto it2 = m_queue_stream->find_buffer(dst); - auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first; - auto i= (static_cast<const uint8_t*>(dst)) - it2->first; - offset/=sizeof(Index); - i/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); - auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset)); - }); - synchronize(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + ::memcpy(dst, src, n); } - /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode - /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the - /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that - /// this buffer is accessed, the data will be copied to the device. - template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - auto host_acc= get_sycl_buffer(dst). 
template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>(); - ::memcpy(host_acc.get_pointer(), src, n); + template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { + auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>(); + memcpy(host_acc.get_pointer(), src, n); } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl - /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the - /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination - /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data - /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back - /// to the cpu only once per function call. - template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto it = m_queue_stream->find_buffer(src); - auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - // Assuming that the dst is the start of the destination pointer - auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n)); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh); - auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); - } - /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} - /// Here is the implementation of memset function on sycl. 
- EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - size_t rng, GRange, tileSize; - parallel_for_setup(n, tileSize, rng, GRange); - sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c )); - synchronize(); - } - - struct memsetCghFunctor{ - cl::sycl::buffer<uint8_t, 1>& m_buf; - const size_t& rng , GRange, tileSize; - const int &c; - memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} - - void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c)); + /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon. + template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { + auto it = buffer_map.find(src); + if (it != buffer_map.end()) { + auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(it->second.get()))-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>(); + memcpy(dst,host_acc.get_pointer(), n); + } else{ + eigen_assert("no device memory found. The memory might be destroyed before creation"); } - }; - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. - return firstLevelCacheSize(); - } - EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_compute_units>(); - // return stream_->deviceProperties().multiProcessorCount; } - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); - // return stream_->deviceProperties().maxThreadsPerBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const { + ::memset(buffer, c, n); } - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesnot have such concept - return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>(); - // return stream_->deviceProperties().maxThreadsPerMultiProcessor; - } - EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>(); - // return stream_->deviceProperties().sharedMemPerBlock; - } - /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } - - EIGEN_STRONG_INLINE void synchronize() const { - sycl_queue().wait_and_throw(); //pass - } - - EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// does not pass. 
Temporarily disabled - sycl_queue().wait_and_throw(); //pass - - } - // This function checks if the runtime recorded an error for the - // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { - return m_queue_stream->ok(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return 1; } }; - - } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 16180ca..069680a 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -12,6 +12,17 @@ namespace Eigen { +// Use the SimpleThreadPool by default. We'll switch to the new non blocking +// thread pool later. +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL +template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>; +typedef NonBlockingThreadPool ThreadPool; +#else +template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>; +typedef SimpleThreadPool ThreadPool; +#endif + + // Barrier is an object that allows one or more threads to wait until // Notify has been called a specified number of times. class Barrier { @@ -245,7 +256,7 @@ struct ThreadPoolDevice { // Split into halves and submit to the pool. Index mid = first + divup((last - first) / 2, block_size) * block_size; pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); - handleRange(first, mid); + pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); }; handleRange(0, n); barrier.Wait(); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 86405e6..b24cdeb 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -33,7 +33,7 @@ namespace Eigen { namespace internal { template<std::size_t n, typename Dimension> struct dget { - static const std::ptrdiff_t value = get<n, Dimension>::value; + static const std::size_t value = get<n, Dimension>::value; }; @@ -90,11 +90,9 @@ struct fixed_size_tensor_index_extraction_helper<Index, 0> // Fixed size #ifndef EIGEN_EMULATE_CXX11_META_H template <typename std::ptrdiff_t... 
Indices> -struct Sizes { +struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> { typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base; - const Base t = Base(); static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); - static const size_t count = Base::count; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { return Base::count; @@ -122,16 +120,16 @@ struct Sizes { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const { - return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t); + return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, *this); } template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const { - return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t); + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this)); } template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const { - return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t); + return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this)); } }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 82dd1e6..0698713 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -41,9 +41,6 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> > // Intermediate typedef to workaround MSVC issue. typedef MakePointer_<T> MakePointerT; typedef typename MakePointerT::Type Type; - typedef typename MakePointerT::RefType RefType; - - }; }; @@ -120,7 +117,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { return m_op; } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d641581..834ce07 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -32,7 +32,6 @@ struct TensorEvaluator typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; - typedef Derived XprType; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? 
@@ -69,9 +68,7 @@ struct TensorEvaluator return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::traits<Derived>::template MakePointer<Scalar>::RefType - coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { eigen_assert(m_data); return m_data[index]; } @@ -97,9 +94,7 @@ struct TensorEvaluator } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::traits<Derived>::template MakePointer<Scalar>::RefType - coeffRef(const array<DenseIndex, NumCoords>& coords) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) { eigen_assert(m_data); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; @@ -157,8 +152,6 @@ struct TensorEvaluator<const Derived, Device> typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; - typedef const Derived XprType; - // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index f060191..08eb559 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -253,7 +253,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D // get data into line_buf const Index stride = m_strides[dim]; if (stride == 1) { - m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); + memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; for (int j = 0; j < line_len; ++j, offset += stride) { @@ -271,7 +271,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D // write back if (FFTDir == FFT_FORWARD && stride == 1) { - m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); + memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index abe85c8..bbd5eb3 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -26,8 +26,8 @@ namespace Eigen { /// Therefore, by adding the default value, we managed to convert the type and it does not break any /// existing code as its default value is T*. namespace internal { -template<typename XprType> -struct traits<TensorForcedEvalOp<XprType> > +template<typename XprType, template <class> class MakePointer_> +struct traits<TensorForcedEvalOp<XprType, MakePointer_> > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; @@ -42,26 +42,31 @@ struct traits<TensorForcedEvalOp<XprType> > enum { Flags = 0 }; + template <class T> struct MakePointer { + // Intermediate typedef to workaround MSVC issue. 
+ typedef MakePointer_<T> MakePointerT; + typedef typename MakePointerT::Type Type; + }; }; -template<typename XprType> -struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense> +template<typename XprType, template <class> class MakePointer_> +struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense> { - typedef const TensorForcedEvalOp<XprType>& type; + typedef const TensorForcedEvalOp<XprType, MakePointer_>& type; }; -template<typename XprType> -struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type> +template<typename XprType, template <class> class MakePointer_> +struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type> { - typedef TensorForcedEvalOp<XprType> type; + typedef TensorForcedEvalOp<XprType, MakePointer_> type; }; } // end namespace internal -template<typename XprType> -class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors> +template<typename XprType, template <class> class MakePointer_> +class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar; @@ -83,10 +88,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOn }; -template<typename ArgType, typename Device> -struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> +template<typename ArgType, typename Device, template <class> class MakePointer_> +struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device> { - typedef TensorForcedEvalOp<ArgType> XprType; + typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; typedef typename XprType::Index Index; @@ -102,7 +107,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - /// op_ is used for sycl + /// op_ is used for sycl : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } @@ -143,17 +148,17 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; } + EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; } /// required by sycl in order to extract the sycl accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; } + const TensorEvaluator<ArgType, Device>& impl() { return m_impl; } /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} + const Device& device() const{return m_device;} private: TensorEvaluator<ArgType, Device> m_impl; const ArgType m_op; const Device& m_device; - CoeffReturnType* m_buffer; + typename MakePointer<CoeffReturnType>::Type m_buffer; }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 2e63899..52b803d 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -20,19 +20,7 @@ namespace Eigen { // 
map_allocator. template<typename T> struct MakePointer { typedef T* Type; - typedef T& RefType; }; -#if defined(EIGEN_USE_SYCL) -namespace TensorSycl { -namespace internal{ -template <typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor; -template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType> -class FullReductionKernelFunctor; -} -} -#endif - - template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap; template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor; @@ -75,7 +63,7 @@ template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp; template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp; template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp; -template<typename XprType> class TensorForcedEvalOp; +template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp; template<typename ExpressionType, typename DeviceType> class TensorDevice; template<typename Derived, typename Device> struct TensorEvaluator; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 3b4f8ed..d73f6dc 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -33,7 +33,7 @@ struct functor_traits<scalar_mod_op<Scalar> > */ template <typename Scalar> struct scalar_mod2_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op) + EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } }; template <typename Scalar> @@ -42,7 +42,7 @@ struct functor_traits<scalar_mod2_op<Scalar> > template <typename Scalar> struct scalar_fmod_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op) + EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { return numext::fmod(a, b); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index ef1c9c4..ede3939 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -37,8 +37,6 @@ namespace { { #ifdef __CUDA_ARCH__ return __clz(val); -#elif defined(__SYCL_DEVICE_ONLY__) - return cl::sycl::clz(val); #elif EIGEN_COMP_MSVC unsigned long index; _BitScanReverse(&index, val); @@ -55,8 +53,6 @@ namespace { { #ifdef __CUDA_ARCH__ return __clzll(val); -#elif defined(__SYCL_DEVICE_ONLY__) - return cl::sycl::clz(val); #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 unsigned long index; _BitScanReverse64(&index, val); @@ -92,8 +88,6 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { #if defined(__CUDA_ARCH__) return __umulhi(a, b); -#elif defined(__SYCL_DEVICE_ONLY__) - return cl::sycl::mul_hi(a, static_cast<uint32_t>(b)); #else return (static_cast<uint64_t>(a) * b) >> 32; #endif @@ -103,8 +97,6 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { #if defined(__CUDA_ARCH__) return __umul64hi(a, b); -#elif 
defined(__SYCL_DEVICE_ONLY__) - return cl::sycl::mul_hi(a, static_cast<uint64_t>(b)); #elif defined(__SIZEOF_INT128__) __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); return static_cast<uint64_t>(v >> 64); @@ -124,7 +116,7 @@ namespace { template <typename T> struct DividerHelper<64, T> { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; @@ -205,8 +197,6 @@ class TensorIntDivisor<int32_t, true> { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { #ifdef __CUDA_ARCH__ return (__umulhi(magic, n) >> shift); -#elif defined(__SYCL_DEVICE_ONLY__) - return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift); #else uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n); return (static_cast<uint32_t>(v >> 32) >> shift); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index f92e39d..ee0078b 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -51,12 +51,4 @@ #endif -#if EIGEN_OS_WIN || EIGEN_OS_WIN64 -#define EIGEN_SLEEP(n) Sleep(n) -#elif EIGEN_OS_GNULINUX -#define EIGEN_SLEEP(n) usleep(n * 1000); -#else -#define EIGEN_SLEEP(n) sleep(std::max<unsigned>(1, n/1000)) -#endif - #endif diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index b5ef31d..615559d 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -75,7 +75,6 @@ struct PacketType<half, GpuDevice> { HasSqrt = 1, HasRsqrt = 1, HasExp = 1, - HasExpm1 = 0, HasLog = 1, HasLog1p = 0, HasLog10 = 0, @@ -169,12 +168,12 @@ template <typename Idx> struct IndexPair { #ifdef EIGEN_HAS_SFINAE namespace internal { - template<typename IndexType, typename Index, Index... Is> + template<typename IndexType, Index... Is> EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) { return { idx[Is]... }; } - template<typename IndexType, typename Index> + template<typename IndexType> EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) { return array<Index, 0>(); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 6ddd2ca..d34f1e3 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -299,16 +299,6 @@ template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice> { EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } }; #endif - -// It is very expensive to start the memcpy kernel on GPU: we therefore only -// use it for large copies. 
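// The removed SYCL specialization applies the heuristic spelled out in the
// comment above: only pay for launching a copy kernel once the slice is large
// enough to amortize the launch cost. A device-agnostic sketch of that
// decision; the threshold matches the 4 MiB constant in this diff, everything
// else is illustrative.
#include <cstddef>
#include <cstring>

constexpr std::size_t kKernelCopyThreshold = 4 * 1024 * 1024;

template <typename Scalar>
void copySlice(Scalar* dst, const Scalar* src, std::size_t count) {
  if (count * sizeof(Scalar) > kKernelCopyThreshold) {
    std::memcpy(dst, src, count * sizeof(Scalar));  // stand-in for the bulk device copy
  } else {
    for (std::size_t i = 0; i < count; ++i) dst[i] = src[i];  // cheap per-element path
  }
}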
-#ifdef EIGEN_USE_SYCL -template <typename Index> struct MemcpyTriggerForSlicing<Index, const Eigen::SyclDevice> { - EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } -}; -#endif - } // Eval as rvalue @@ -503,14 +493,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi } return NULL; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{ - return m_impl; - } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& startIndices() const{ - return m_offsets; - } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { @@ -711,12 +694,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, { typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType; static const int NumDims = internal::array_size<Strides>::value; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const<Scalar>::type ScalarNonConst; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - typedef Strides Dimensions; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -729,7 +706,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()), m_exprStartIndices(op.startIndices()), m_exprStopIndices(op.stopIndices()) + : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) { // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped; @@ -739,7 +716,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); }else{ - /* implies m_strides[i]<0 by assert */ + /* implies m_strides[i]<0 by assert */ startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); } @@ -802,6 +779,13 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, sizeof(Scalar)); } + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef Strides Dimensions; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -827,15 +811,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, return NULL; } - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStartIndices() const { return m_exprStartIndices; } - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStopIndices() const { return m_exprStopIndices; } - //use by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& strides() const { 
return m_strides; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;} - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { @@ -857,11 +832,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, } static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { -#ifndef __SYCL_DEVICE_ONLY__ return numext::maxi(min, numext::mini(max,value)); -#else - return cl::sycl::clamp(value, min, max); -#endif } array<Index, NumDims> m_outputStrides; @@ -874,10 +845,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, DSizes<Index, NumDims> m_offsets; // offset in a flattened shape const Strides m_strides; std::size_t m_block_total_size_max; - //use by sycl - const StartIndices m_exprStartIndices; - //use by sycl - const StopIndices m_exprStopIndices; }; // Eval as lvalue diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index a8e2552..647bcf1 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -200,13 +200,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& padding_value() const { return m_paddingValue; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;} - private: EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( Index index, int dim_index) const { diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index e341e2e..41d0d00 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -11,20 +11,8 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H -// clang is incompatible with the CUDA syntax wrt making a kernel a class friend, -// so we'll use a macro to make clang happy. 
-#ifndef KERNEL_FRIEND -#if defined(__clang__) && defined(__CUDA__) -#define KERNEL_FRIEND friend __global__ -#else -#define KERNEL_FRIEND friend -#endif -#endif - - namespace Eigen { - /** \class TensorReduction * \ingroup CXX11_Tensor_Module * @@ -692,23 +680,17 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, template <typename S, typename O, bool V> friend struct internal::FullReducerShard; #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 - template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); - template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); - template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); -#endif - template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); - - template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); + template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); + template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); #endif + template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); -#if defined(EIGEN_USE_SYCL) - template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor; - template<typename CoeffReturnType_ ,typename OutAccessor_, typename HostExpr_, typename FunctorExpr_, typename Op_, typename Dims_, typename Index_, typename TupleType_> friend class TensorSycl::internal::FullReductionKernelFunctor; + template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif - template <typename S, typename O, typename D> friend struct internal::InnerReducer; // Returns the Index in the input tensor of the first value that needs to be diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index edb0ab2..65638b6 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -287,6 +287,7 @@ struct FullReductionLauncher< void>::type> { static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType Scalar; const int block_size = 256; 
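// Worked example of the launch-size arithmetic here, using the constants from
// this hunk and an illustrative input size: with num_coeffs = 1 << 20,
// block_size = 256 and num_per_thread = 128, each block covers
// 256 * 128 = 32768 coefficients, so num_blocks = divup(1048576, 32768) = 32.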
const int num_per_thread = 128; const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index c3ca129..3daecb0 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -25,28 +25,61 @@ namespace Eigen { namespace internal { -template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{ +template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{ template<typename BufferTOut, typename BufferTIn> -static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { - auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable { + auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable { cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, cl::sycl::range<1>{std::min(length, local)}}; /* Two accessors are used: one to the buffer that is being reduced, * and a second to local memory, used to store intermediate data. */ - auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h); - auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h); - typedef decltype(aI) InputAccessor; - typedef decltype(aOut) OutputAccessor; - typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor; - LocalAccessor scratch(cl::sycl::range<1>(local), h); + auto aI = + bufI.template get_access<cl::sycl::access::mode::read_write>(h); + auto aOut = + bufOut->template get_access<cl::sycl::access::mode::discard_write>(h); + cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, + cl::sycl::access::target::local> + scratch(cl::sycl::range<1>(local), h); /* The parallel_for invocation chosen is the variant with an nd_item * parameter, since the code requires barriers for correctness. */ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local)); + h.parallel_for<KernelName>( + r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) { + size_t globalid = id.get_global(0); + size_t localid = id.get_local(0); + /* All threads collectively read from global memory into local. + * The barrier ensures all threads' IO is resolved before + * execution continues (strictly speaking, all threads within + * a single work-group - there is no co-ordination between + * work-groups, only work-items). */ + if (globalid < length) { + scratch[localid] = aI[globalid]; + } + id.barrier(cl::sycl::access::fence_space::local_space); + + /* Apply the reduction operation between the current local + * id and the one on the other half of the vector. */ + if (globalid < length) { + int min = (length < local) ? length : local; + for (size_t offset = min / 2; offset > 0; offset /= 2) { + if (localid < offset) { + scratch[localid] += scratch[localid + offset]; + } + id.barrier(cl::sycl::access::fence_space::local_space); + } + /* The final result will be stored in local id 0. 
*/ + if (localid == 0) { + aI[id.get_group(0)] = scratch[localid]; + if((length<=local) && globalid ==0){ + aOut[globalid]=scratch[localid]; + } + } + } + }); }; - dev.sycl_queue().submit(f); - dev.asynchronousExec(); + dev.m_queue.submit(f); + dev.m_queue.throw_asynchronous(); /* At this point, you could queue::wait_and_throw() to ensure that * errors are caught quickly. However, this would likely impact @@ -54,23 +87,18 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev length = length / local; } while (length > 1); -} -}; -template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{ -template<typename BufferTOut, typename BufferTIn> -static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ - syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(), - bufOut, bufI, dev, length, local); + } + }; +/// For now let's start with a full reducer /// Self is useless here because in expression construction we are going to treat reduction as a leafnode. /// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the /// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as // a leafNode. - template <typename Self, typename Op, bool Vectorizable> struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { @@ -79,8 +107,8 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr; - FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); + typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; + auto functors = TensorSycl::internal::extractFunctors(self.impl()); int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread. size_t inputSize =self.impl().dimensions().TotalSize(); size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input @@ -88,7 +116,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { if(rng ==0) { red_factor=1; }; - size_t tileSize =dev.sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2; + size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2; size_t GRange=std::max((size_t )1, rng); // convert global range to power of 2 for redecution @@ -105,66 +133,105 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { size_t outTileSize = tileSize; /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one. 
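// The work-group reduction in the kernel above is the classic halving tree
// over local memory. A self-contained CPU sketch of the same loop structure;
// it assumes a power-of-two size, just as the kernel relies on the rounded
// work-group range, and none of the names below are Eigen API.
#include <cstddef>
#include <vector>

template <typename T>
T treeReduce(std::vector<T> scratch) {  // scratch.size() must be a power of two
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
    for (std::size_t i = 0; i < offset; ++i)
      scratch[i] += scratch[i + offset];  // on the device each pass ends with a barrier
  }
  return scratch.empty() ? T() : scratch[0];
}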
if (GRange < outTileSize) outTileSize=GRange; + // getting final out buffer at the moment the created buffer is true because there is no need for assign + auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output); /// creating the shared memory for calculating reduction. /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can /// recursively apply reduction on it in order to reduce the whole. auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange)); typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims; - // Dims dims= self.xprDims(); - //Op functor = reducer; - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is a workaround for gcc 4.8 bug - typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType; + Dims dims= self.xprDims(); + Op functor = reducer; + dev.m_queue.submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator - TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); + auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh); - typedef decltype(tmp_global_accessor) OutAccessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), - TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, TupleType> - (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors)); + + cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr; + auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour + /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the + /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. + const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); + /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is + /// the device_evaluator is detectable and recognisable on the device. 
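// What follows in this kernel body: each work-item reduces red_factor
// consecutive inputs into its own slot of tmp_global_accessor (work-item 0
// also folds in the remainder when the input size is not a multiple of
// red_factor); the per-item partial results are then collapsed to a single
// value by syclGenericBufferReducer further down.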
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + /// const cast added as a naive solution to solve the qualifier drop error + auto globalid=itemID.get_global_linear_id(); + + if(globalid<rng) + tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor)); + else + tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0); + + if(remaining!=0 && globalid==0 ) + // this will add the rest of input buffer when the input size is not devidable to red_factor. + tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor)); + }); }); - dev.asynchronousExec(); + dev.m_queue.throw_asynchronous(); - // getting final out buffer at the moment the created buffer is true because there is no need for assign - auto out_buffer =dev.get_sycl_buffer(output); - /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize); +/// This is used to recursively reduce the tmp value to an element of 1; + syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize); } }; - template <typename Self, typename Op> struct InnerReducer<Self, Op, const Eigen::SyclDevice> { typedef typename Self::CoeffReturnType CoeffReturnType; static const bool HasOptimizedImplementation = false; - static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) { + static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr; - FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); - typename Self::Index range, GRange, tileSize; - typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims; + typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; + auto functors = TensorSycl::internal::extractFunctors(self.impl()); + + size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2; + size_t GRange=num_coeffs_to_preserve; + if (tileSize>GRange) tileSize=GRange; + else if(GRange>tileSize){ + size_t xMode = GRange % tileSize; + if (xMode != 0) GRange += (tileSize - xMode); + } // getting final out buffer at the moment the created buffer is true because there is no need for assign /// creating the shared memory for calculating reduction. /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can /// recursively apply reduction on it in order to reduce the whole. - dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is workaround for gcc 4.8 bug. 
- typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; - // create a tuple of accessors from Evaluator - Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output); - Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1); - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), - TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index> - (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); + typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims; + Dims dims= self.xprDims(); + Op functor = reducer; + dev.m_queue.submit([&](cl::sycl::handler &cgh) { + // create a tuple of accessors from Evaluator + auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); + auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output); + + cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr; + auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour + /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the + /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. + const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); + /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is + /// the device_evaluator is detectable and recognisable on the device. 
+ typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeiceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); + /// const cast added as a naive solution to solve the qualifier drop error + auto globalid=itemID.get_global_linear_id(); + if (globalid< static_cast<size_t>(num_coeffs_to_preserve)) { + typename DeiceSelf::CoeffReturnType accum = functor.initialize(); + GenericDimReducer<DeiceSelf::NumReducedDims-1, DeiceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast<Op&>(functor), &accum); + functor.finalize(accum); + output_accessor.get_pointer()[globalid]= accum; + } + }); }); - dev.asynchronousExec(); + dev.m_queue.throw_asynchronous(); return false; } }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index e430b08..14e392e 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -224,11 +224,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from sycl device - ReverseDimensions functor() const { return m_reverse; } - protected: Dimensions m_dimensions; array<Index, NumDims> m_strides; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index edc9dd3..113c060 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -117,7 +117,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation()) + : m_impl(op.expression(), device) { const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); @@ -187,11 +187,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - // required by sycl - EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;} - // required by sycl - EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {return m_impl;} - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; @@ -211,12 +206,11 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> return inputIndex + index * m_inputStrides[NumDims - 1]; } } + Dimensions m_dimensions; array<Index, NumDims> m_outputStrides; array<Index, NumDims> m_inputStrides; TensorEvaluator<ArgType, Device> m_impl; - /// required by sycl - Shuffle m_shuffle; }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index e6a666f..2854a4a 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -31,12 +31,12 @@ namespace Eigen { * * \sa Tensor */ -template<typename T, 
typename Dimensions, int Options> class TensorStorage; +template<typename T, typename Dimensions, int Options_> class TensorStorage; // Pure fixed-size storage -template<typename T, typename FixedDimensions, int Options_> -class TensorStorage +template<typename T, int Options_, typename FixedDimensions> +class TensorStorage<T, FixedDimensions, Options_> { private: static const std::size_t Size = FixedDimensions::total_size; @@ -66,7 +66,7 @@ class TensorStorage // pure dynamic -template<typename T, typename IndexType, int NumIndices_, int Options_> +template<typename T, int Options_, typename IndexType, int NumIndices_> class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> { public: diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 2237140..6c35bfd 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -117,11 +117,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_strides(op.strides()) + : m_impl(op.expression(), device) { m_dimensions = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]); + m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]); } const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); @@ -224,11 +224,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } - /// required by sycl in order to extract the accessor - Strides functor() const { return m_strides; } - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { @@ -255,9 +250,9 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> array<Index, NumDims> m_outputStrides; array<Index, NumDims> m_inputStrides; TensorEvaluator<ArgType, Device> m_impl; - const Strides m_strides; }; + // Eval as lvalue template<typename Strides, typename ArgType, typename Device> struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> @@ -291,11 +286,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> return this->m_impl.coeffRef(this->srcCoeff(index)); } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; } - template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index 9d5a6d4..bb8800d 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -20,14 +20,12 @@ template <class T> struct MakeGlobalPointer { typedef typename cl::sycl::global_ptr<T>::pointer_t Type; - typedef typename cl::sycl::global_ptr<T>::reference_t RefType; }; // global 
pointer to set different attribute state for a class template <class T> struct MakeLocalPointer { typedef typename cl::sycl::local_ptr<T>::pointer_t Type; - typedef typename cl::sycl::local_ptr<T>::reference_t RefType; }; @@ -35,9 +33,6 @@ namespace Eigen { namespace TensorSycl { namespace internal { - template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer; - - /// This struct is used for special expression nodes with no operations (for example assign and selectOP). struct NoOP; @@ -80,15 +75,8 @@ template<typename T> struct GetType<false, T>{ /// this is used for extracting tensor reduction #include "TensorReductionSycl.h" -/// this is used for extracting tensor convolution -#include "TensorConvolutionSycl.h" - // kernel execution using fusion #include "TensorSyclRun.h" -//sycl functors -#include "TensorSyclFunctors.h" - -#include "TensorContractionSycl.h" #endif // end of EIGEN_USE_SYCL #endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index ee8f3c9..8729c86 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -48,9 +48,9 @@ struct DeviceConvertor{ /// specialisation of the \ref ConvertToDeviceExpression struct when the node /// type is TensorMap #define TENSORMAPCONVERT(CVQual)\ -template <typename T, int Options_, template <class> class MakePointer_>\ -struct ConvertToDeviceExpression<CVQual TensorMap<T, Options_, MakePointer_> > {\ - typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\ +template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_>\ +struct ConvertToDeviceExpression<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_> > {\ + typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\ }; TENSORMAPCONVERT(const) @@ -97,18 +97,8 @@ template <typename Expr>\ struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \ : DeviceConvertor<ExprNode, Res, Expr>{}; -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp -#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\ -template <typename Expr>\ -struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\ - typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\ -}; -KERNELBROKERCONVERTFORCEDEVAL(const) -KERNELBROKERCONVERTFORCEDEVAL() -#undef KERNELBROKERCONVERTFORCEDEVAL - - - +KERNELBROKERCONVERT(const, true, TensorForcedEvalOp) +KERNELBROKERCONVERT(, false, TensorForcedEvalOp) KERNELBROKERCONVERT(const, true, TensorEvalToOp) KERNELBROKERCONVERT(, false, TensorEvalToOp) #undef KERNELBROKERCONVERT @@ -124,40 +114,6 @@ KERNELBROKERCONVERTREDUCTION(const) KERNELBROKERCONVERTREDUCTION() #undef KERNELBROKERCONVERTREDUCTION -#define KERNELBROKERCONVERTSLICEOP(CVQual)\ -template<typename StartIndices, typename Sizes, typename XprType>\ -struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\ - typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename ConvertToDeviceExpression<XprType>::Type> Type;\ -}; - -KERNELBROKERCONVERTSLICEOP(const) 
-KERNELBROKERCONVERTSLICEOP() -#undef KERNELBROKERCONVERTSLICEOP - - -#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\ -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\ -struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\ - typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename ConvertToDeviceExpression<XprType>::Type> Type;\ -}; - -KERNELBROKERCONVERTERSLICESTRIDEOP(const) -KERNELBROKERCONVERTERSLICESTRIDEOP() -#undef KERNELBROKERCONVERTERSLICESTRIDEOP - - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp -#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\ -template <DenseIndex DimId, typename Expr>\ -struct ConvertToDeviceExpression<CVQual TensorChippingOp<DimId, Expr> > {\ - typedef CVQual TensorChippingOp<DimId, typename ConvertToDeviceExpression<Expr>::Type> Type;\ -}; -KERNELBROKERCONVERTCHIPPINGOP(const) -KERNELBROKERCONVERTCHIPPINGOP() -#undef KERNELBROKERCONVERTCHIPPINGOP - - - } // namespace internal } // namespace TensorSycl } // namespace Eigen diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 3b83b1d..7ed3a3a 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -25,21 +25,12 @@ namespace Eigen { namespace TensorSycl { namespace internal { - -template <typename Expr, typename Dims> -struct DeviceFixedSizeTensor; - -template <typename Expr, typename std::ptrdiff_t... Indices> -struct DeviceFixedSizeTensor<Expr, Eigen::Sizes<Indices...>>{ - template<typename Data> - static EIGEN_ALWAYS_INLINE Expr instantiate(Data& dt) {return Expr(ConvertToActualTypeSycl(typename Expr::Scalar, dt), Indices...);} -}; /// this class is used by EvalToOp in order to create an lhs expression which is /// a pointer from an accessor on device-only buffer template <typename PtrType, size_t N, typename... Params> struct EvalToLHSConstructor { PtrType expr; - EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t) : expr(ConvertToActualTypeSycl(typename Eigen::internal::remove_all<PtrType>::type, utility::tuple::get<N>(t))) {} + EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t): expr((&(*(utility::tuple::get<N>(t).get_pointer())))) {} }; /// \struct ExprConstructor is used to reconstruct the expression on the device and @@ -54,39 +45,21 @@ struct ExprConstructor; /// specialisation of the \ref ExprConstructor struct when the node type is /// TensorMap #define TENSORMAP(CVQual)\ -template <typename T, int Options_,\ +template <typename Scalar_, int Options_, int Options2_, int Options3_, int NumIndices_, typename IndexType_,\ template <class> class MakePointer_, size_t N, typename... 
Params>\ -struct ExprConstructor< CVQual TensorMap<T, Options_, MakeGlobalPointer>,\ -CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\ - typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\ +struct ExprConstructor< CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>,\ +CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options3_, MakePointer_>, N>, Params...>{\ + typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ - : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\ + : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\ }; - TENSORMAP(const) TENSORMAP() #undef TENSORMAP -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorMap -#define TENSORMAPFIXEDSIZE(CVQual)\ -template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_,\ -template <class> class MakePointer_, size_t N, typename... Params>\ -struct ExprConstructor< CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer>,\ -CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_>, N>, Params...>{\ - typedef CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer> Type;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\ - : expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\ -}; -TENSORMAPFIXEDSIZE(const) -TENSORMAPFIXEDSIZE() -#undef TENSORMAPFIXEDSIZE - #define UNARYCATEGORY(CVQual)\ template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\ struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\ @@ -188,30 +161,8 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual ASSIGN(const) ASSIGN() #undef ASSIGN - - - - - /// specialisation of the \ref ExprConstructor struct when the node type is - /// const TensorAssignOp - #define CONVERSIONEXPRCONST(CVQual)\ - template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\ - struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\ - typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\ - typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\ - my_nested_type nestedExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\ - }; - - CONVERSIONEXPRCONST(const) - CONVERSIONEXPRCONST() - #undef CONVERSIONEXPRCONST - /// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorEvalToOp /// 0 here is the output number in the buffer +/// TensorEvalToOp #define EVALTO(CVQual)\ template <typename OrigExpr, typename Expr, typename... 
Params>\ struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\ @@ -234,14 +185,14 @@ EVALTO() /// TensorForcedEvalOp #define FORCEDEVAL(CVQual)\ template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\ -struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\ +struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\ CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\ - typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\ - TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\ + typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\ + TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, 0, typename TensorForcedEvalOp<DevExpr>::Index>, 0, MakeGlobalPointer> Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ - : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ + : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\ }; FORCEDEVAL(const) @@ -262,130 +213,17 @@ struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPoi CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\ static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\ typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\ - NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\ + NumIndices, 0, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, 0, MakeGlobalPointer> Type;\ Type expr;\ template <typename FuncDetector>\ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ - :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ + : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\ }; SYCLREDUCTIONEXPR(const) SYCLREDUCTIONEXPR() #undef SYCLREDUCTIONEXPR - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorContractionOp -#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\ -template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... 
Params>\ -struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\ -CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\ - static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\ - typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\ - NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\ - typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\ - Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\ - :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\ -}; - -SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp) -#undef SYCLCONTRACTIONCONVOLUTION - - - -#define SYCLSLICEOPEXPR(CVQual)\ -template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename my_xpr_type::Type> Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.dimensions()) {}\ -}; - -SYCLSLICEOPEXPR(const) -SYCLSLICEOPEXPR() -#undef SYCLSLICEOPEXPR - - -#define SYCLSLICESTRIDEOPEXPR(CVQual)\ -template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename my_xpr_type::Type> Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.stopIndices(),funcD.strides()) {}\ -}; - -SYCLSLICESTRIDEOPEXPR(const) -SYCLSLICESTRIDEOPEXPR() -#undef SYCLSLICESTRIDEOPEXPR - -#define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\ -template<typename Param, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... 
>{\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\ -}; - -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const) -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, ) - -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const) -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, ) -#undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST - -#define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\ -template<typename Param, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\ -}; - -SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const) -SYCLPADDINGOPEXPRCONST(TensorPaddingOp, ) -#undef SYCLPADDINGOPEXPRCONST - - -// TensorChippingOp -#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\ -template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\ -struct ExprConstructor<CVQual TensorChippingOp <DimId, OrigXprType> , CVQual TensorChippingOp<DimId, XprType>, Params... >{\ - typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\ - typedef CVQual TensorChippingOp<DimId, typename my_xpr_type::Type> Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template <typename FuncDetector>\ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.offset(), funcD.dimId()) {}\ -}; - -SYCLTENSORCHIPPINGOPEXPR(const) -SYCLTENSORCHIPPINGOPEXPR() -#undef SYCLTENSORCHIPPINGOPEXPR - - /// template deduction for \ref ExprConstructor struct template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params> auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t) diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index b512d43..b1da685 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -35,8 +35,6 @@ namespace Eigen { namespace TensorSycl { namespace internal { -#define RETURN_CPP11(expr) ->decltype(expr) {return expr;} - /// \struct ExtractAccessor: Extract Accessor Class is used to extract the /// accessor from a buffer. 
/// Depending on the type of the leaf node we can get a read accessor or a @@ -45,192 +43,159 @@ template <typename Evaluator> struct ExtractAccessor; struct AccessorConstructor{ - template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval) - RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval)) - - template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2) - RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) - - template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3) - RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) - - template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval) - RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()))) + template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, Arg eval) + -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) { + return ExtractAccessor<Arg>::getTuple(cgh, eval); + } + + template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1, Arg2 eval2) + -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) { + return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2)); + } + template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1 , Arg2 eval2 , Arg3 eval3) + -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) { + return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))); + } + template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval) + -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM, + typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){ + return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM, typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data())); + } }; /// specialisation of the \ref ExtractAccessor struct when the node type is -/// TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp -#define SYCLUNARYCATEGORYEXTACC(CVQual)\ -template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\ -RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp and const TensorBroadcastingOp +template <template<class, class> class 
UnaryCategory, typename OP, typename RHSExpr, typename Dev> +struct ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > { + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> eval) + -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){ + return AccessorConstructor::getTuple(cgh, eval.impl()); + } }; -SYCLUNARYCATEGORYEXTACC(const) -SYCLUNARYCATEGORYEXTACC() -#undef SYCLUNARYCATEGORYEXTACC - - -/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp -#define SYCLBINARYCATEGORYEXTACC(CVQual)\ -template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\ +/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp +template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev> +struct ExtractAccessor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> > +: ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {}; + +/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorCwiseBinaryOp +template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev> +struct ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > { + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> eval) + -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){ + return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()); + } }; - -SYCLBINARYCATEGORYEXTACC(const) -SYCLBINARYCATEGORYEXTACC() -#undef SYCLBINARYCATEGORYEXTACC +/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp +template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev> +struct ExtractAccessor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > +: ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{}; /// specialisation of the \ref ExtractAccessor struct when the node type is /// const TensorCwiseTernaryOp -#define SYCLTERNARYCATEGORYEXTACC(CVQual)\ -template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\ +template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev> +struct ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > { + static inline auto 
getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> eval) + -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){ + return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()); + } }; -SYCLTERNARYCATEGORYEXTACC(const) -SYCLTERNARYCATEGORYEXTACC() -#undef SYCLTERNARYCATEGORYEXTACC +/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseTernaryOp +template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev> +struct ExtractAccessor<TensorEvaluator<TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > +: ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{}; +/// specialisation of the \ref ExtractAccessor struct when the node type is +/// const TensorCwiseSelectOp. This is a special case where there is no OP +template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev> +struct ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > { + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> eval) + -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){ + return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()); + } +}; /// specialisation of the \ref ExtractAccessor struct when the node type is /// TensorCwiseSelectOp. This is a special case where there is no OP -#define SYCLSELECTOPEXTACC(CVQual)\ -template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\ +template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev> +struct ExtractAccessor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > +: ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >{}; + +/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorAssignOp +template <typename LHSExpr, typename RHSExpr, typename Dev> +struct ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > { + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> eval) + -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){ + return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()); + } }; -SYCLSELECTOPEXTACC(const) -SYCLSELECTOPEXTACC() -#undef SYCLSELECTOPEXTACC - /// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp -#define SYCLTENSORASSIGNOPEXTACC(CVQual)\ -template <typename LHSExpr, typename RHSExpr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), 
eval.right_impl()))\ -}; - - SYCLTENSORASSIGNOPEXTACC(const) - SYCLTENSORASSIGNOPEXTACC() - #undef SYCLTENSORASSIGNOPEXTACC +template <typename LHSExpr, typename RHSExpr, typename Dev> +struct ExtractAccessor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> > +: ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{}; /// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap #define TENSORMAPEXPR(CVQual, ACCType)\ template <typename PlainObjectType, int Options_, typename Dev>\ struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\ + static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> eval)\ + -> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\ + return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\ + }\ }; - TENSORMAPEXPR(const, cl::sycl::access::mode::read) TENSORMAPEXPR(, cl::sycl::access::mode::read_write) #undef TENSORMAPEXPR -/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp -#define SYCLFORCEDEVALEXTACC(CVQual)\ -template <typename Expr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ +/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorForcedEvalOp +template <typename Expr, typename Dev> +struct ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> > { + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> eval) + -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){ + return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval); + } }; -SYCLFORCEDEVALEXTACC(const) -SYCLFORCEDEVALEXTACC() -#undef SYCLFORCEDEVALEXTACC - +/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp +template <typename Expr, typename Dev> +struct ExtractAccessor<TensorEvaluator<TensorForcedEvalOp<Expr>, Dev> > +: ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> >{}; + +/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorEvalToOp +template <typename Expr, typename Dev> +struct ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> > { + static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<const TensorEvalToOp<Expr>, Dev> eval) + -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){ + return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())); + } +}; /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp -#define SYCLEVALTOEXTACC(CVQual)\ -template <typename Expr, typename Dev>\ -struct 
ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\ - RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\ +template <typename Expr, typename Dev> +struct ExtractAccessor<TensorEvaluator<TensorEvalToOp<Expr>, Dev> > +: ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> >{}; + +/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorReductionOp +template <typename OP, typename Dim, typename Expr, typename Dev> +struct ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> > { + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> eval) + -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){ + return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval); + } }; -SYCLEVALTOEXTACC(const) -SYCLEVALTOEXTACC() -#undef SYCLEVALTOEXTACC - /// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp -#define SYCLREDUCTIONEXTACC(CVQual)\ -template <typename OP, typename Dim, typename Expr, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ -}; - -SYCLREDUCTIONEXTACC(const) -SYCLREDUCTIONEXTACC() -#undef SYCLREDUCTIONEXTACC - -/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp -#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\ -template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\ - struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\ -}; - -SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp) -SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp) -SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp) -SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp) -#undef SYCLCONTRACTIONCONVOLUTIONEXTACC - - -/// specialisation of the \ref ExtractAccessor struct when the node type is -/// const TensorSlicingOp. -#define SYCLSLICEOPEXTACC(CVQual)\ -template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\ - RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\ -}; - -SYCLSLICEOPEXTACC(const) -SYCLSLICEOPEXTACC() -#undef SYCLSLICEOPEXTACC -// specialisation of the \ref ExtractAccessor struct when the node type is -/// TensorStridingSlicingOp. 
-#define SYCLSLICESTRIDEOPEXTACC(CVQual)\ -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ -}; - -SYCLSLICESTRIDEOPEXTACC(const) -SYCLSLICESTRIDEOPEXTACC() -#undef SYCLSLICESTRIDEOPEXTACC - -// specialisation of the \ref ExtractAccessor struct when the node type is -/// TensorChippingOp. -#define SYCLTENSORCHIPPINGOPEXTACC(CVQual)\ -template<DenseIndex DimId, typename XprType, typename Dev>\ -struct ExtractAccessor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev> >{\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev>& eval)\ - RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ -}; - -SYCLTENSORCHIPPINGOPEXTACC(const) -SYCLTENSORCHIPPINGOPEXTACC() -#undef SYCLTENSORCHIPPINGOPEXTACC - +template <typename OP, typename Dim, typename Expr, typename Dev> +struct ExtractAccessor<TensorEvaluator<TensorReductionOp<OP, Dim, Expr>, Dev> > +: ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> >{}; /// template deduction for \ref ExtractAccessor template <typename Evaluator> -auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval) --> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, eval)) { - return ExtractAccessor<Evaluator>::getTuple(cgh, eval); +auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& expr) +-> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, expr)) { + return ExtractAccessor<Evaluator>::getTuple(cgh, expr); } } /// namespace TensorSycl diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h index ee02018..4271253 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h @@ -36,277 +36,135 @@ namespace internal { template <typename Evaluator> struct FunctorExtractor{ typedef typename Evaluator::Dimensions Dimensions; const Dimensions m_dimensions; - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + const Dimensions& dimensions() const { return m_dimensions; } FunctorExtractor(const Evaluator& expr) : m_dimensions(expr.dimensions()) {} }; -/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything -///TensorConversionOp -#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\ -template <typename ArgType1, typename ArgType2, typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\ - FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\ - FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\ - : subExpr(expr.impl()) {}\ +/// specialisation of the \ref FunctorExtractor struct when the node type is +/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp, and const TensorBroadcastingOp +template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev> +struct FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > { + 
FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr; + OP func; + FunctorExtractor(const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev>& expr) + : rhsExpr(expr.impl()), func(expr.functor()) {} }; +/// specialisation of the \ref FunctorExtractor struct when the node type is +/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp +template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev> +struct FunctorExtractor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> > +: FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> >{}; -SYCLEXTRFUNCCONVERSION(TensorConversionOp, const) -SYCLEXTRFUNCCONVERSION(TensorConversionOp, ) -#undef SYCLEXTRFUNCCONVERSION - -#define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\ -template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\ -struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\ -FunctorExtractor(const TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev>& ){}\ +/// specialisation of the \ref FunctorExtractor struct when the node type is +/// const TensorCwiseBinaryOp +template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev> +struct FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > { + FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr; + FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr; + OP func; + FunctorExtractor(const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr) + : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {} }; -SYCLEXTRTENSORMAPFIXEDSIZE(const) -SYCLEXTRTENSORMAPFIXEDSIZE() -#undef SYCLEXTRTENSORMAPFIXEDSIZE +/// specialisation of the \ref FunctorExtractor struct when the node type is +/// const TensorCwiseBinaryOp +template <template <class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev> +struct FunctorExtractor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > +: FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{}; /// specialisation of the \ref FunctorExtractor struct when the node type is -/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp -#define SYCLEXTRFUNCUNARY(CVQual)\ -template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\ - FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\ - const OP func;\ - FunctorExtractor(const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& expr)\ - : rhsExpr(expr.impl()), func(expr.functor()) {}\ +/// const TensorCwiseTernaryOp +template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev> +struct FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > { + FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr; + FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr; + FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr; + OP func; + FunctorExtractor(const TensorEvaluator<const 
TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr) + : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {} }; -SYCLEXTRFUNCUNARY(const) -SYCLEXTRFUNCUNARY() -#undef SYCLEXTRFUNCUNARY +/// specialisation of the \ref FunctorExtractor struct when the node type is +/// TensorCwiseTernaryOp +template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev> +struct FunctorExtractor<TensorEvaluator< TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > +:FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{}; /// specialisation of the \ref FunctorExtractor struct when the node type is -/// TensorCwiseBinaryOp -#define SYCLEXTRFUNCBIINARY(CVQual)\ -template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\ - FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\ - FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\ - const OP func;\ - FunctorExtractor(const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)\ - : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}\ +/// const TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated. +template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev> +struct FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > { + FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr; + FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr; + FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr; + FunctorExtractor(const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr) + : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {} }; -SYCLEXTRFUNCBIINARY(const) -SYCLEXTRFUNCBIINARY() -#undef SYCLEXTRFUNCBIINARY +/// specialisation of the \ref FunctorExtractor struct when the node type is +/// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated +template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev> +struct FunctorExtractor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > +:FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {}; -/// specialisation of the \ref FunctorExtractor struct when the node type is TensorCwiseTernaryOp -#define SYCLEXTRFUNCTERNARY(CVQual)\ -template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\ - FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;\ - FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;\ - FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;\ - const OP func;\ - FunctorExtractor(const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)\ - : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}\ +/// specialisation of the \ref FunctorExtractor struct when the node type is +/// const TensorAssignOp. 
This is an specialisation without OP so it has to be separated. +template <typename LHSExpr, typename RHSExpr, typename Dev> +struct FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > { + FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr; + FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr; + FunctorExtractor(const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr) + : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {} }; -SYCLEXTRFUNCTERNARY(const) -SYCLEXTRFUNCTERNARY() -#undef SYCLEXTRFUNCTERNARY - /// specialisation of the \ref FunctorExtractor struct when the node type is -/// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated. -#define SYCLEXTRFUNCSELECTOP(CVQual)\ -template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\ -struct FunctorExtractor< TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\ - FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;\ - FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;\ - FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;\ - FunctorExtractor(const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)\ - : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}\ -}; +/// TensorAssignOp. This is an specialisation without OP so it has to be separated. +template <typename LHSExpr, typename RHSExpr, typename Dev> +struct FunctorExtractor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> > +:FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{}; -SYCLEXTRFUNCSELECTOP(const) -SYCLEXTRFUNCSELECTOP() -#undef SYCLEXTRFUNCSELECTOP /// specialisation of the \ref FunctorExtractor struct when the node type is -/// const TensorAssignOp. This is an specialisation without OP so it has to be separated. -#define SYCLEXTRFUNCASSIGNOP(CVQual)\ -template <typename LHSExpr, typename RHSExpr, typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\ - FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\ - FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\ - FunctorExtractor(const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)\ - : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}\ +/// const TensorEvalToOp, This is an specialisation without OP so it has to be separated. +template <typename RHSExpr, typename Dev> +struct FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > { + FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr; + FunctorExtractor(const TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev>& expr) + : rhsExpr(expr.impl()) {} }; -SYCLEXTRFUNCASSIGNOP(const) -SYCLEXTRFUNCASSIGNOP() -#undef SYCLEXTRFUNCASSIGNOP /// specialisation of the \ref FunctorExtractor struct when the node type is -/// TensorEvalToOp, This is an specialisation without OP so it has to be separated. -#define SYCLEXTRFUNCEVALTOOP(CVQual)\ -template <typename RHSExpr, typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev> > {\ - FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\ - FunctorExtractor(const TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev>& expr)\ - : rhsExpr(expr.impl()) {}\ -}; - -SYCLEXTRFUNCEVALTOOP(const) -SYCLEXTRFUNCEVALTOOP() -#undef SYCLEXTRFUNCEVALTOOP +/// TensorEvalToOp. This is a specialisation without OP so it has to be separated. 
+template <typename RHSExpr, typename Dev> +struct FunctorExtractor<TensorEvaluator<TensorEvalToOp<RHSExpr>, Dev> > +: FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {}; template<typename Dim, size_t NumOutputDim> struct DimConstr { template<typename InDim> - static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return dims;} + static inline Dim getDim(InDim dims ) {return dims;} }; template<typename Dim> struct DimConstr<Dim, 0> { template<typename InDim> - static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));} -}; - -#define SYCLEXTRFUNCREDUCTIONOP(CVQual)\ -template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{\ - typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\ - typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\ - const Dimensions m_dimensions;\ - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ - FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\ - : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\ -}; - - -SYCLEXTRFUNCREDUCTIONOP(const) -SYCLEXTRFUNCREDUCTIONOP() -#undef SYCLEXTRFUNCREDUCTIONOP - -#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\ -template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\ -struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\ - typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\ - typedef typename Evaluator::Dimensions Dimensions;\ - const Dimensions m_dimensions;\ - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\ - FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\ - : m_dimensions(expr.dimensions()) {}\ + static inline Dim getDim(InDim dims ) {return Dim(dims.TotalSize());} }; - -SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp) -SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp) -SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp) -SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp) -#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP - -/// specialisation of the \ref FunctorExtractor struct when the node type is -/// const TensorSlicingOp. This is an specialisation without OP so it has to be separated. 
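The FunctorExtractor specialisations above keep only the host-copyable pieces of an evaluator (the functor plus an extractor for each child), mirroring the shape of the expression tree so it can be rebuilt on the device. Below is a minimal stand-alone sketch of that idea; SimpleExtractor, BinaryNode and LeafNode are made-up names for illustration and are not part of Eigen (and the non-const variant inherits constructors only so the sketch stays runnable).

#include <iostream>

// Made-up expression nodes standing in for Eigen's tensor expression types.
struct LeafNode { int id; };
template <typename OP, typename LHS, typename RHS>
struct BinaryNode { OP op; LHS lhs; RHS rhs; };

// Primary template, specialised per node type (as FunctorExtractor is above).
template <typename Node> struct SimpleExtractor;

// Leaves carry no functor, so nothing needs to be extracted from them.
template <> struct SimpleExtractor<LeafNode> {
  explicit SimpleExtractor(const LeafNode&) {}
};

// The const binary node keeps its functor and recurses into both children,
// like the lhsExpr/rhsExpr/func members in the specialisations above.
template <typename OP, typename LHS, typename RHS>
struct SimpleExtractor<const BinaryNode<OP, LHS, RHS> > {
  SimpleExtractor<LHS> lhsExpr;
  SimpleExtractor<RHS> rhsExpr;
  OP func;
  explicit SimpleExtractor(const BinaryNode<OP, LHS, RHS>& node)
      : lhsExpr(node.lhs), rhsExpr(node.rhs), func(node.op) {}
};

// The non-const specialisation only forwards to the const one.
template <typename OP, typename LHS, typename RHS>
struct SimpleExtractor<BinaryNode<OP, LHS, RHS> >
    : SimpleExtractor<const BinaryNode<OP, LHS, RHS> > {
  using SimpleExtractor<const BinaryNode<OP, LHS, RHS> >::SimpleExtractor;
};

struct Plus { int operator()(int a, int b) const { return a + b; } };

int main() {
  const BinaryNode<Plus, LeafNode, LeafNode> expr = { Plus(), { 0 }, { 1 } };
  SimpleExtractor<const BinaryNode<Plus, LeafNode, LeafNode> > extracted(expr);
  std::cout << extracted.func(2, 3) << "\n";  // prints 5
}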
-#define SYCLEXTRFUNCTSLICEOP(CVQual)\ -template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\ - FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\ - const StartIndices m_offsets;\ - const Sizes m_dimensions;\ - FunctorExtractor(const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& expr)\ - : xprExpr(expr.impl()), m_offsets(expr.startIndices()), m_dimensions(expr.dimensions()) {}\ - EIGEN_STRONG_INLINE const StartIndices& startIndices() const {return m_offsets;}\ - EIGEN_STRONG_INLINE const Sizes& dimensions() const {return m_dimensions;}\ -}; - -SYCLEXTRFUNCTSLICEOP(const) -SYCLEXTRFUNCTSLICEOP() -#undef SYCLEXTRFUNCTSLICEOP - -#define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\ -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\ - FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\ - const StartIndices m_startIndices;\ - const StopIndices m_stopIndices;\ - const Strides m_strides;\ - FunctorExtractor(const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices,Strides, XprType>, Dev>& expr)\ - : xprExpr(expr.impl()), m_startIndices(expr.exprStartIndices()), m_stopIndices(expr.exprStopIndices()), m_strides(expr.strides()) {}\ - EIGEN_STRONG_INLINE const StartIndices& startIndices() const { return m_startIndices; }\ - EIGEN_STRONG_INLINE const StartIndices& stopIndices() const { return m_stopIndices; }\ - EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; }\ -}; - -SYCLEXTRFUNCTSLICESTRIDEOP(const) -SYCLEXTRFUNCTSLICESTRIDEOP() -#undef SYCLEXTRFUNCTSLICESTRIDEOP - -// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory -#define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\ -template<typename Param, typename XprType, typename Dev>\ -struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\ - FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\ - const Param m_param;\ - EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\ - FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\ - : xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\ -}; - -SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const) -SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), ) - -SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const) -SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), ) -#undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT - -// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory -#define PADDINGOPFUNCEXT(OPEXPR, FUNCCALL, SCALARFUNCCALL, CVQual)\ -template<typename Param, typename XprType, typename Dev>\ -struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\ - FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\ - const Param m_param;\ - typedef typename Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>::Scalar Scalar;\ - const Scalar m_scalar_param;\ - EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\ - EIGEN_STRONG_INLINE const Scalar& scalar_param() const { return m_scalar_param; }\ - FunctorExtractor(const 
Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\ - : xprExpr(expr.impl()), m_param(expr.FUNCCALL), m_scalar_param(expr.SCALARFUNCCALL) {}\ -}; - -PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const) -PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), ) -#undef PADDINGOPFUNCEXT - -/// specialisation of the \ref FunctorExtractor struct when the node type is TensorContractionOp and TensorConcatenationOp -/// for TensorContractionOp the LHS and RHS here are the original one no need to apply condition on their type. -#define SYCLEXTRFUNCCONTRACTCONCAT(OPEXPR, FUNCCALL, CVQual)\ -template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>\ -struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev> > {\ - FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\ - FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\ - const Param func;\ - FunctorExtractor(const TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev>& expr)\ - : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\ -}; - -// TensorConcatenationOp -SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const) -SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),) -#undef SYCLEXTRFUNCCONTRACTCONCAT - -//TensorChippingOp -#define SYCLEXTRFUNCCHIPPINGOP(CVQual)\ -template<DenseIndex DimId, typename XprType, typename Device>\ -struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>>{\ - FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\ - const DenseIndex m_dim;\ - const DenseIndex m_offset;\ - EIGEN_STRONG_INLINE const DenseIndex& dimId() const { return m_dim; }\ - EIGEN_STRONG_INLINE const DenseIndex& offset() const { return m_offset; }\ - FunctorExtractor(const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>& expr)\ - : xprExpr(expr.impl()), m_dim(expr.dimId()), m_offset(expr.offset()) {}\ +template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device> +struct FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{ + typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator; + typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions; + const Dimensions m_dimensions; + const Dimensions& dimensions() const { return m_dimensions; } + FunctorExtractor(const TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr) + : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {} }; -SYCLEXTRFUNCCHIPPINGOP(const) -SYCLEXTRFUNCCHIPPINGOP() -#undef SYCLEXTRFUNCCHIPPINGOP +template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device> +struct FunctorExtractor<TensorEvaluator<TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>> +: FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{}; /// template deduction function for FunctorExtractor template <typename Evaluator> auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> { diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h deleted file mode 100644 index 2f77790..0000000 --- 
a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h +++ /dev/null @@ -1,245 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: eigen@codeplay.com -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// General include header of SYCL target for Tensor Module -#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H -#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H - -namespace Eigen { -namespace TensorSycl { -namespace internal { - - template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{ - OP op; - OutputAccessor aOut; - InputAccessor aI; - LocalAccessor scratch; - size_t length, local; - GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_) - : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){} - void operator()(cl::sycl::nd_item<1> itemID) { - size_t globalid = itemID.get_global(0); - size_t localid = itemID.get_local(0); - /* All threads collectively read from global memory into local. - * The barrier ensures all threads' IO is resolved before - * execution continues (strictly speaking, all threads within - * a single work-group - there is no co-ordination between - * work-groups, only work-items). */ - if (globalid < length) { - scratch[localid] = aI[globalid]; - } - itemID.barrier(cl::sycl::access::fence_space::local_space); - - /* Apply the reduction operation between the current local - * id and the one on the other half of the vector. */ - if (globalid < length) { - auto min = (length < local) ? length : local; - for (size_t offset = min / 2; offset > 0; offset /= 2) { - if (localid < offset) { - auto accum = op.initialize(); - op.reduce(scratch[localid], &accum); - op.reduce(scratch[localid + offset], &accum); - op.finalize(accum); - scratch[localid]=accum; - //scratch[localid] += scratch[localid + offset]; - } - itemID.barrier(cl::sycl::access::fence_space::local_space); - } - /* The final result will be stored in local id 0. 
*/ - if (localid == 0) { - aI[itemID.get_group(0)] = scratch[localid]; - if((length<=local) && globalid ==0){ - auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut); - aOutPtr[0]=scratch[0]; - } - } - } - } - - }; - -/// ReductionFunctor -template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor { - public: - typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index) - :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {} - void operator()(cl::sycl::nd_item<1> itemID) { - - typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr; - auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour - /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the - /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. - const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); - /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is - /// the device_evaluator is detectable and recognisable on the device. 
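The deleted GenericKernelReducer above implements a classic work-group tree reduction: every work-item copies one element into local memory, then in each barrier-separated pass the first `offset` items fold in the value `offset` positions away, halving `offset` until item 0 holds the partial result. The sketch below replays that halving schedule serially in plain C++ (no SYCL); Sum is a made-up stand-in for a reducer with the initialize/reduce/finalize interface used above.

#include <cstddef>
#include <iostream>
#include <vector>

// Stand-in reducer with the initialize/reduce/finalize shape used above.
struct Sum {
  float initialize() const { return 0.f; }
  void reduce(float value, float* accum) const { *accum += value; }
  float finalize(float accum) const { return accum; }
};

// Serial replay of the tree reduction: `scratch` plays the role of the local
// memory buffer, and each iteration of the outer loop corresponds to one
// barrier-separated pass on the device.
float workgroup_reduce(std::vector<float> scratch, const Sum& op) {
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
    for (std::size_t localid = 0; localid < offset; ++localid) {
      float accum = op.initialize();
      op.reduce(scratch[localid], &accum);
      op.reduce(scratch[localid + offset], &accum);
      scratch[localid] = op.finalize(accum);
    }
    // On the device a local barrier separates the passes here.
  }
  return scratch[0];
}

int main() {
  std::vector<float> data = { 1, 2, 3, 4, 5, 6, 7, 8 };  // power-of-two size for simplicity
  std::cout << workgroup_reduce(data, Sum()) << "\n";    // prints 36
}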
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf; - auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); - auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); - /// const cast added as a naive solution to solve the qualifier drop error - auto globalid=static_cast<Index>(itemID.get_global_linear_id()); - if (globalid< range) { - typename DeviceSelf::CoeffReturnType accum = functor.initialize(); - Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum); - functor.finalize(accum); - output_accessor_ptr[globalid]= accum; - } - } - private: - write_accessor output_accessor; - FunctorExpr functors; - Tuple_of_Acc tuple_of_accessors; - Dims dims; - Op functor; - Index range; -}; - -template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index> -class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> { - public: - typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor; - typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op; - ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, - Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_) - :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {} - void operator()(cl::sycl::nd_item<1> itemID) { - - typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr; - auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour - /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the - /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. - const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor); - /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is - /// the device_evaluator is detectable and recognisable on the device. 
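The MeanReducer specialisation of ReductionFunctor above swaps the mean reducer for a plain SumReducer and divides the accumulated value by num_values_to_reduce when the result is written out. The same idea in isolation, as an illustrative helper rather than the Eigen reducers:

#include <iostream>
#include <vector>

// Illustrative only: a mean computed as a sum reduction followed by a single
// division, mirroring how the specialisation above performs the division at
// write-back instead of inside the reducer.
float mean_via_sum(const std::vector<float>& values) {
  float accum = 0.f;                  // SumReducer-style initialize()
  for (float v : values) accum += v;  // SumReducer-style reduce()
  return accum / static_cast<float>(values.size());  // division at write-back
}

int main() {
  std::vector<float> values = { 2.f, 4.f, 9.f };
  std::cout << mean_via_sum(values) << "\n";  // prints 5
}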
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf; - auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); - auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor); - /// const cast added as a naive solution to solve the qualifier drop error - auto globalid=static_cast<Index>(itemID.get_global_linear_id()); - if (globalid< range) { - typename DeviceSelf::CoeffReturnType accum = functor.initialize(); - Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum); - functor.finalize(accum); - output_accessor_ptr[globalid]= accum/num_values_to_reduce; - } - } - private: - write_accessor output_accessor; - FunctorExpr functors; - Tuple_of_Acc tuple_of_accessors; - Dims dims; - Op functor; - Index range; - Index num_values_to_reduce; -}; - -template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType> -class FullReductionKernelFunctor{ -public: - typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - OutAccessor tmp_global_accessor; - Index rng , remaining, red_factor; - Op op; - Dims dims; - FunctorExpr functors; - TupleType tuple_of_accessors; - - FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc) - :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){} - - void operator()(cl::sycl::nd_item<1> itemID) { - - typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr; - auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour - /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the - /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. - const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op); - /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is - /// the device_evaluator is detectable and recognisable on the device. - auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); - /// const cast added as a naive solution to solve the qualifier drop error - auto globalid=itemID.get_global_linear_id(); - - tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? 
Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)) - : static_cast<CoeffReturnType>(op.initialize()); - - if(remaining!=0 && globalid==0 ){ - // this will add the rest of input buffer when the input size is not devidable to red_factor. - auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>:: - reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op)); - auto accum = op.initialize(); - op.reduce(tmp_global_accessor.get_pointer()[0], &accum); - op.reduce(remaining_reduce, &accum); - op.finalize(accum); - tmp_global_accessor.get_pointer()[0]=accum; - - } - } -}; - -template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType> -class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{ -public: - typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; - typedef Eigen::internal::SumReducer<CoeffReturnType> Op; - - OutAccessor tmp_global_accessor; - Index rng , remaining, red_factor; - Op op; - Dims dims; - FunctorExpr functors; - TupleType tuple_of_accessors; - - FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc) - :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){} - - void operator()(cl::sycl::nd_item<1> itemID) { - - typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr; - auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour - /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the - /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. - const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op); - /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is - /// the device_evaluator is detectable and recognisable on the device. - auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice()); - /// const cast added as a naive solution to solve the qualifier drop error - auto globalid=itemID.get_global_linear_id(); - auto scale = (rng*red_factor) + remaining; - - tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? 
((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale) - :static_cast<CoeffReturnType>(op.initialize())/scale; - - if(remaining!=0 && globalid==0 ){ - // this will add the rest of input buffer when the input size is not devidable to red_factor. - auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op)); - auto accum = op.initialize(); - tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale; - op.reduce(tmp_global_accessor.get_pointer()[0], &accum); - op.reduce(remaining_reduce, &accum); - op.finalize(accum); - tmp_global_accessor.get_pointer()[0]=accum/scale; - - } - } -}; - -} -} -} -#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h index a1c112f..25d1fac 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h @@ -44,120 +44,68 @@ struct CategoryCount<Arg,Args...>{ }; /// specialisation of the \ref LeafCount struct when the node type is const TensorMap -#define SYCLTENSORMAPLEAFCOUNT(CVQual)\ -template <typename PlainObjectType, int Options_, template <class> class MakePointer_>\ -struct LeafCount<CVQual TensorMap<PlainObjectType, Options_, MakePointer_> > {\ - static const size_t Count =1;\ +template <typename PlainObjectType, int Options_, template <class> class MakePointer_> +struct LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> > { + static const size_t Count =1; }; -SYCLTENSORMAPLEAFCOUNT(const) -SYCLTENSORMAPLEAFCOUNT() -#undef SYCLTENSORMAPLEAFCOUNT +/// specialisation of the \ref LeafCount struct when the node type is TensorMap +template <typename PlainObjectType, int Options_, template <class> class MakePointer_> +struct LeafCount<TensorMap<PlainObjectType, Options_, MakePointer_> > :LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> >{}; -// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp -#define SYCLCATEGORYLEAFCOUNT(CVQual)\ -template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>\ -struct LeafCount<CVQual CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {}; - -SYCLCATEGORYLEAFCOUNT(const) -SYCLCATEGORYLEAFCOUNT() -#undef SYCLCATEGORYLEAFCOUNT +// const TensorCwiseUnaryOp, const TensorCwiseNullaryOp, const TensorCwiseBinaryOp, const TensorCwiseTernaryOp, and Const TensorBroadcastingOp +template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr> +struct LeafCount<const CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {}; +// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp +template <template <class, class...> class CategoryExpr, typename OP, typename... 
RHSExpr> +struct LeafCount<CategoryExpr<OP, RHSExpr...> > :LeafCount<const CategoryExpr<OP, RHSExpr...> >{}; /// specialisation of the \ref LeafCount struct when the node type is const TensorSelectOp is an exception -#define SYCLSELECTOPLEAFCOUNT(CVQual)\ -template <typename IfExpr, typename ThenExpr, typename ElseExpr>\ -struct LeafCount<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {}; - -SYCLSELECTOPLEAFCOUNT(const) -SYCLSELECTOPLEAFCOUNT() -#undef SYCLSELECTOPLEAFCOUNT +template <typename IfExpr, typename ThenExpr, typename ElseExpr> +struct LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {}; +/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp +template <typename IfExpr, typename ThenExpr, typename ElseExpr> +struct LeafCount<TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >: LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > {}; -/// specialisation of the \ref LeafCount struct when the node type is TensorAssignOp -#define SYCLLEAFCOUNTASSIGNOP(CVQual)\ -template <typename LHSExpr, typename RHSExpr>\ -struct LeafCount<CVQual TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {}; +/// specialisation of the \ref LeafCount struct when the node type is const TensorAssignOp +template <typename LHSExpr, typename RHSExpr> +struct LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {}; -SYCLLEAFCOUNTASSIGNOP(const) -SYCLLEAFCOUNTASSIGNOP() -#undef SYCLLEAFCOUNTASSIGNOP +/// specialisation of the \ref LeafCount struct when the node type is +/// TensorAssignOp is an exception. It is not the same as Unary +template <typename LHSExpr, typename RHSExpr> +struct LeafCount<TensorAssignOp<LHSExpr, RHSExpr> > :LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >{}; /// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp -#define SYCLFORCEDEVALLEAFCOUNT(CVQual)\ -template <typename Expr>\ -struct LeafCount<CVQual TensorForcedEvalOp<Expr> > {\ - static const size_t Count =1;\ +template <typename Expr> +struct LeafCount<const TensorForcedEvalOp<Expr> > { + static const size_t Count =1; }; -SYCLFORCEDEVALLEAFCOUNT(const) -SYCLFORCEDEVALLEAFCOUNT() -#undef SYCLFORCEDEVALLEAFCOUNT +/// specialisation of the \ref LeafCount struct when the node type is TensorForcedEvalOp +template <typename Expr> +struct LeafCount<TensorForcedEvalOp<Expr> >: LeafCount<const TensorForcedEvalOp<Expr> > {}; -/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp -#define EVALTOLEAFCOUNT(CVQual)\ -template <typename Expr>\ -struct LeafCount<CVQual TensorEvalToOp<Expr> > {\ - static const size_t Count = 1 + CategoryCount<Expr>::Count;\ +/// specialisation of the \ref LeafCount struct when the node type is const TensorEvalToOp +template <typename Expr> +struct LeafCount<const TensorEvalToOp<Expr> > { + static const size_t Count = 1 + CategoryCount<Expr>::Count; }; -EVALTOLEAFCOUNT(const) -EVALTOLEAFCOUNT() -#undef EVALTOLEAFCOUNT - /// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp -#define REDUCTIONLEAFCOUNT(CVQual)\ -template <typename OP, typename Dim, typename Expr>\ -struct LeafCount<CVQual TensorReductionOp<OP, Dim, Expr> > {\ - static const size_t Count =1;\ +template <typename OP, typename Dim, typename Expr> +struct LeafCount<const TensorReductionOp<OP, Dim, Expr> > { + static const size_t Count =1; }; 
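A pattern worth calling out: throughout this patch the old CVQual macros, which stamped out each specialisation twice (once with const, once without), are replaced by writing the const specialisation in full and deriving the non-const one from it. A stand-alone sketch of that de-duplication idiom, using a made-up LeafCounter trait and Wrapper node rather than the Eigen types:

#include <cstddef>
#include <iostream>

template <typename T> struct Wrapper {};   // made-up node type

template <typename T> struct LeafCounter;  // primary template, left undefined

// All of the logic lives in the const specialisation...
template <typename T>
struct LeafCounter<const Wrapper<T> > {
  static const std::size_t Count = 1;
};

// ...and the non-const specialisation simply inherits it, so the two can
// never drift apart the way two separate macro expansions could.
template <typename T>
struct LeafCounter<Wrapper<T> > : LeafCounter<const Wrapper<T> > {};

int main() {
  std::cout << static_cast<std::size_t>(LeafCounter<Wrapper<int> >::Count) << " "
            << static_cast<std::size_t>(LeafCounter<const Wrapper<int> >::Count)
            << "\n";  // prints "1 1"
}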
-REDUCTIONLEAFCOUNT(const) -REDUCTIONLEAFCOUNT() -#undef REDUCTIONLEAFCOUNT - -/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp -#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\ -template <typename Indices, typename LhsXprType, typename RhsXprType>\ -struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\ - static const size_t Count =1;\ -}; - -CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp) -CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp) -CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp) -CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp) -#undef CONTRACTIONCONVOLUTIONLEAFCOUNT - - - -/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp -#define SLICEOPLEAFCOUNT(CVQual)\ -template <typename StartIndices, typename Sizes, typename XprType>\ -struct LeafCount<CVQual TensorSlicingOp<StartIndices, Sizes, XprType> >:CategoryCount<XprType>{}; - -SLICEOPLEAFCOUNT(const) -SLICEOPLEAFCOUNT() -#undef SLICEOPLEAFCOUNT - - -/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp -#define CHIPPINGOPLEAFCOUNT(CVQual)\ -template <DenseIndex DimId, typename XprType>\ -struct LeafCount<CVQual TensorChippingOp<DimId, XprType> >:CategoryCount<XprType>{}; - -CHIPPINGOPLEAFCOUNT(const) -CHIPPINGOPLEAFCOUNT() -#undef CHIPPINGOPLEAFCOUNT - - -#define SLICESTRIDEOPLEAFCOUNT(CVQual)\ -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\ -struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{}; - -SLICESTRIDEOPLEAFCOUNT(const) -SLICESTRIDEOPLEAFCOUNT() -#undef SLICESTRIDEOPLEAFCOUNT +/// specialisation of the \ref LeafCount struct when the node type is TensorReductionOp +template <typename OP, typename Dim, typename Expr> +struct LeafCount<TensorReductionOp<OP, Dim, Expr> >: LeafCount<const TensorReductionOp<OP, Dim, Expr> >{}; +/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp +template <typename Expr> +struct LeafCount<TensorEvalToOp<Expr> >: LeafCount<const TensorEvalToOp<Expr> >{}; } /// namespace TensorSycl } /// namespace internal diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h index 74566dc..d4c250c 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h @@ -122,9 +122,9 @@ ASSIGNEXPR() /// specialisation of the \ref PlaceHolderExpression when the node is /// TensorMap #define TENSORMAPEXPR(CVQual)\ -template <typename T, int Options_, template <class> class MakePointer_, size_t N>\ -struct PlaceHolderExpression< CVQual TensorMap< T, Options_, MakePointer_>, N> {\ - typedef CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N> Type;\ +template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_, size_t N>\ +struct PlaceHolderExpression< CVQual TensorMap< Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> {\ + typedef CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> Type;\ }; TENSORMAPEXPR(const) @@ -157,18 +157,6 @@ EVALTO() /// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorChippingOp -#define 
CHIPPINGOP(CVQual)\ -template <DenseIndex DimId, typename Expr, size_t N>\ -struct PlaceHolderExpression<CVQual TensorChippingOp<DimId, Expr>, N> {\ - typedef CVQual TensorChippingOp< DimId, typename CalculateIndex <N, Expr>::ArgType> Type;\ -}; - -CHIPPINGOP(const) -CHIPPINGOP() -#undef CHIPPINGOP - -/// specialisation of the \ref PlaceHolderExpression when the node is /// TensorReductionOp #define SYCLREDUCTION(CVQual)\ template <typename OP, typename Dims, typename Expr, size_t N>\ @@ -179,45 +167,6 @@ SYCLREDUCTION(const) SYCLREDUCTION() #undef SYCLREDUCTION - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorReductionOp -#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\ -template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\ -struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\ - typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\ -}; -SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp) -SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp) -SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp) -#undef SYCLCONTRACTIONCONVOLUTIONPLH - - -/// specialisation of the \ref PlaceHolderExpression when the node is -/// TensorCwiseSelectOp -#define SLICEOPEXPR(CVQual)\ -template <typename StartIndices, typename Sizes, typename XprType, size_t N>\ -struct PlaceHolderExpression<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, N> {\ - typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename CalculateIndex<N, XprType>::ArgType> Type;\ -}; - -SLICEOPEXPR(const) -SLICEOPEXPR() -#undef SLICEOPEXPR - - -#define SYCLSLICESTRIDEOPPLH(CVQual)\ -template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, size_t N>\ -struct PlaceHolderExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, N> {\ - typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename CalculateIndex<N, XprType>::ArgType> Type;\ -}; - -SYCLSLICESTRIDEOPPLH(const) -SYCLSLICESTRIDEOPPLH() -#undef SYCLSLICESTRIDEOPPLH - - /// template deduction for \ref PlaceHolderExpression struct template <typename Expr> struct createPlaceHolderExpression { diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index cac7855..7914b6f 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -25,70 +25,43 @@ namespace Eigen { namespace TensorSycl { - -template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{ - typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr; - - typedef typename Expr::Index Index; - FunctorExpr functors; - TupleType tuple_of_accessors; - Index range; - ExecExprFunctorKernel(Index range_, FunctorExpr functors_, TupleType tuple_of_accessors_) - : functors(functors_), tuple_of_accessors(tuple_of_accessors_), range(range_){} - void operator()(cl::sycl::nd_item<1> itemID) { - typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr; - auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); - typename DevExpr::Index gId = 
static_cast<typename DevExpr::Index>(itemID.get_global_linear_id()); - if (gId < range) - device_evaluator.evalScalar(gId); - } -}; - /// The run function in tensor sycl convert the expression tree to a buffer /// based expression tree; /// creates the expression tree for the device with accessor to buffers; /// construct the kernel and submit it to the sycl queue. -/// std::array does not have TotalSize. So I have to get the size through template specialisation. -template<typename , typename Dimensions> struct DimensionSize{ - static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){ - return dim.TotalSize(); - } -}; -#define DIMSIZEMACRO(CVQual)\ -template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\ - static inline Index getDimSize(const std::array<Index, NumDims>& dim){\ - return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\ - }\ -}; - -DIMSIZEMACRO(const) -DIMSIZEMACRO() -#undef DIMSIZEMACRO - - template <typename Expr, typename Dev> void run(Expr &expr, Dev &dev) { Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr; - FunctorExpr functors = internal::extractFunctors(evaluator); - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // create a tuple of accessors from Evaluator - typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType; - TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator); - typename Expr::Index range, GRange, tileSize; - typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions())); - dev.parallel_for_setup(total_size, tileSize, range, GRange); + typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr; + auto functors = internal::extractFunctors(evaluator); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), - ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range - , functors, tuple_of_accessors - )); + size_t tileSize =dev.m_queue.get_device(). 
template get_info<cl::sycl::info::device::max_work_group_size>()/2; + dev.m_queue.submit([&](cl::sycl::handler &cgh) { + + // create a tuple of accessors from Evaluator + auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator); + const auto range = utility::tuple::get<0>(tuple_of_accessors).get_range()[0]; + size_t GRange=range; + if (tileSize>GRange) tileSize=GRange; + else if(GRange>tileSize){ + size_t xMode = GRange % tileSize; + if (xMode != 0) GRange += (tileSize - xMode); + } + // run the kernel + cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { + typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr; + auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); + auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice()); + if (itemID.get_global_linear_id() < range) { + device_evaluator.evalScalar(static_cast<int>(itemID.get_global_linear_id())); + } + }); }); - dev.asynchronousExec(); + dev.m_queue.throw_asynchronous(); } + evaluator.cleanup(); } } // namespace TensorSycl diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h index 58ab0f0..063b027 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h @@ -20,7 +20,6 @@ #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP - namespace utility { namespace tuple { /// \struct StaticIf @@ -232,5 +231,4 @@ Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) { } } // tuple } // utility - #endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index a1e944e..ffcf8b0 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -58,8 +58,6 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > }; template <typename T> struct MakePointer { typedef T* Type; - typedef T& RefType; - }; }; @@ -78,8 +76,6 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> > }; template <typename T> struct MakePointer { typedef T* Type; - typedef T& RefType; - }; }; @@ -102,8 +98,6 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> > // Intermediate typedef to workaround MSVC issue. 
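In the rewritten run() above, the global iteration space GRange is padded up to the next multiple of the work-group size tileSize (after tileSize has first been clamped down when the range is smaller than one tile), and the kernel guards with itemID.get_global_linear_id() < range so the padding work-items do nothing. The padding step in isolation, as an illustrative helper rather than Eigen code:

#include <cassert>
#include <cstddef>
#include <iostream>

// Round a global work size up to the next multiple of the work-group size,
// as done for GRange/tileSize in run() above. Assumes tile_size is non-zero.
std::size_t round_up_to_tile(std::size_t global_range, std::size_t tile_size) {
  assert(tile_size > 0);
  const std::size_t remainder = global_range % tile_size;
  return remainder == 0 ? global_range : global_range + (tile_size - remainder);
}

int main() {
  std::cout << round_up_to_tile(1000, 128) << "\n";  // prints 1024
  std::cout << round_up_to_tile(1024, 128) << "\n";  // prints 1024
}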
typedef MakePointer_<T> MakePointerT; typedef typename MakePointerT::Type Type; - typedef typename MakePointerT::RefType RefType; - }; }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index d23f2e4..3523e7c 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -23,7 +23,6 @@ struct static_val { template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { - EIGEN_UNUSED_VARIABLE(v); eigen_assert(v == n); } }; diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 9dcc9da..354bce5 100644 --- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -20,13 +20,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { typedef RunQueue<Task, 1024> Queue; NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment()) - : NonBlockingThreadPoolTempl(num_threads, true, env) {} - - NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning, - Environment env = Environment()) - : num_threads_(num_threads), - allow_spinning_(allow_spinning), - env_(env), + : env_(env), threads_(num_threads), queues_(num_threads), coprimes_(num_threads), @@ -34,20 +28,19 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { blocked_(0), spinning_(0), done_(false), - cancelled_(false), ec_(waiters_) { - waiters_.resize(num_threads_); + waiters_.resize(num_threads); - // Calculate coprimes of num_threads_. + // Calculate coprimes of num_threads. // Coprimes are used for a random walk over all threads in Steal // and NonEmptyQueueIndex. Iteration is based on the fact that if we take // a walk starting thread index t and calculate num_threads - 1 subsequent // indices as (t + coprime) % num_threads, we will cover all threads without // repetitions (effectively getting a presudo-random permutation of thread // indices). - for (int i = 1; i <= num_threads_; i++) { + for (int i = 1; i <= num_threads; i++) { unsigned a = i; - unsigned b = num_threads_; + unsigned b = num_threads; // If GCD(a, b) == 1, then a and b are coprimes. while (b != 0) { unsigned tmp = a; @@ -58,33 +51,24 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { coprimes_.push_back(i); } } - for (int i = 0; i < num_threads_; i++) { + for (int i = 0; i < num_threads; i++) { queues_.push_back(new Queue()); } - for (int i = 0; i < num_threads_; i++) { + for (int i = 0; i < num_threads; i++) { threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); } } ~NonBlockingThreadPoolTempl() { done_ = true; - // Now if all threads block without work, they will start exiting. // But note that threads can continue to work arbitrary long, // block, submit new work, unblock and otherwise live full life. - if (!cancelled_) { - ec_.Notify(true); - } else { - // Since we were cancelled, there might be entries in the queues. - // Empty them to prevent their destructor from asserting. - for (size_t i = 0; i < queues_.size(); i++) { - queues_[i]->Flush(); - } - } + ec_.Notify(true); // Join threads explicitly to avoid destruction order issues. 
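The NonBlockingThreadPool constructor above precomputes the numbers in [1, num_threads] that are coprime with the thread count; Steal() and NonEmptyQueueIndex() then visit victim queues as (t + coprime) % num_threads, which touches every queue exactly once before repeating. Both pieces in a stand-alone sketch:

#include <iostream>
#include <vector>

// Collect every value in [1, n] that is coprime with n, using the same
// Euclid GCD loop as the pool constructor above.
std::vector<unsigned> coprimes_of(unsigned n) {
  std::vector<unsigned> coprimes;
  for (unsigned i = 1; i <= n; i++) {
    unsigned a = i;
    unsigned b = n;
    while (b != 0) {  // Euclid's algorithm: afterwards a == GCD(i, n)
      unsigned tmp = a;
      a = b;
      b = tmp % b;
    }
    if (a == 1) coprimes.push_back(i);
  }
  return coprimes;
}

int main() {
  const unsigned num_threads = 6;
  const std::vector<unsigned> coprimes = coprimes_of(num_threads);  // {1, 5}
  // Walking from thread 2 with stride 5 visits each index exactly once:
  // 2 1 0 5 4 3 -- a cheap pseudo-random permutation of the victim queues.
  unsigned t = 2;
  for (unsigned i = 0; i < num_threads; i++) {
    std::cout << t << " ";
    t = (t + coprimes.back()) % num_threads;
  }
  std::cout << "\n";
}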
- for (size_t i = 0; i < num_threads_; i++) delete threads_[i]; - for (size_t i = 0; i < num_threads_; i++) delete queues_[i]; + for (size_t i = 0; i < threads_.size(); i++) delete threads_[i]; + for (size_t i = 0; i < threads_.size(); i++) delete queues_[i]; } void Schedule(std::function<void()> fn) { @@ -107,31 +91,14 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { // completes overall computations, which in turn leads to destruction of // this. We expect that such scenario is prevented by program, that is, // this is kept alive while any threads can potentially be in Schedule. - if (!t.f) { + if (!t.f) ec_.Notify(false); - } - else { + else env_.ExecuteTask(t); // Push failed, execute directly. - } - } - - void Cancel() { - cancelled_ = true; - done_ = true; - - // Let each thread know it's been cancelled. -#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION - for (size_t i = 0; i < threads_.size(); i++) { - threads_[i]->OnCancel(); - } -#endif - - // Wake up the threads without work to let them exit on their own. - ec_.Notify(true); } int NumThreads() const final { - return num_threads_; + return static_cast<int>(threads_.size()); } int CurrentThreadId() const final { @@ -155,8 +122,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { }; Environment env_; - const int num_threads_; - const bool allow_spinning_; MaxSizeVector<Thread*> threads_; MaxSizeVector<Queue*> queues_; MaxSizeVector<unsigned> coprimes_; @@ -164,7 +129,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { std::atomic<unsigned> blocked_; std::atomic<bool> spinning_; std::atomic<bool> done_; - std::atomic<bool> cancelled_; EventCount ec_; // Main worker thread loop. @@ -175,62 +139,32 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { pt->thread_id = thread_id; Queue* q = queues_[thread_id]; EventCount::Waiter* waiter = &waiters_[thread_id]; - // TODO(dvyukov,rmlarsen): The time spent in Steal() is proportional - // to num_threads_ and we assume that new work is scheduled at a - // constant rate, so we set spin_count to 5000 / num_threads_. The - // constant was picked based on a fair dice roll, tune it. - const int spin_count = - allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0; - if (num_threads_ == 1) { - // For num_threads_ == 1 there is no point in going through the expensive - // steal loop. Moreover, since Steal() calls PopBack() on the victim - // queues it might reverse the order in which ops are executed compared to - // the order in which they are scheduled, which tends to be - // counter-productive for the types of I/O workloads the single thread - // pools tend to be used for. - while (!cancelled_) { - Task t = q->PopFront(); - for (int i = 0; i < spin_count && !t.f; i++) { - if (!cancelled_.load(std::memory_order_relaxed)) { - t = q->PopFront(); - } - } + for (;;) { + Task t = q->PopFront(); + if (!t.f) { + t = Steal(); if (!t.f) { - if (!WaitForWork(waiter, &t)) { - return; + // Leave one thread spinning. This reduces latency. + // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it. + // Also, the time it takes to attempt to steal work 1000 times depends + // on the size of the thread pool. However the speed at which the user + // of the thread pool submit tasks is independent of the size of the + // pool. Consider a time based limit instead. 
+ if (!spinning_ && !spinning_.exchange(true)) { + for (int i = 0; i < 1000 && !t.f; i++) { + t = Steal(); + } + spinning_ = false; } - } - if (t.f) { - env_.ExecuteTask(t); - } - } - } else { - while (!cancelled_) { - Task t = q->PopFront(); - if (!t.f) { - t = Steal(); if (!t.f) { - // Leave one thread spinning. This reduces latency. - if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) { - for (int i = 0; i < spin_count && !t.f; i++) { - if (!cancelled_.load(std::memory_order_relaxed)) { - t = Steal(); - } else { - return; - } - } - spinning_ = false; - } - if (!t.f) { - if (!WaitForWork(waiter, &t)) { - return; - } + if (!WaitForWork(waiter, &t)) { + return; } } } - if (t.f) { - env_.ExecuteTask(t); - } + } + if (t.f) { + env_.ExecuteTask(t); } } } @@ -267,18 +201,14 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { int victim = NonEmptyQueueIndex(); if (victim != -1) { ec_.CancelWait(waiter); - if (cancelled_) { - return false; - } else { - *t = queues_[victim]->PopBack(); - return true; - } + *t = queues_[victim]->PopBack(); + return true; } // Number of blocked threads is used as termination condition. // If we are shutting down and all worker threads blocked without work, // that's we are done. blocked_++; - if (done_ && blocked_ == num_threads_) { + if (done_ && blocked_ == threads_.size()) { ec_.CancelWait(waiter); // Almost done, but need to re-check queues. // Consider that all queues are empty and all worker threads are preempted diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h index 49d0cdc..05ed76c 100644 --- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h @@ -177,13 +177,6 @@ class RunQueue { // Can be called by any thread at any time. bool Empty() const { return Size() == 0; } - // Delete all the elements from the queue. - void Flush() { - while (!Empty()) { - PopFront(); - } - } - private: static const unsigned kMask = kSize - 1; static const unsigned kMask2 = (kSize << 1) - 1; diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h index 3357286..e75d0f4 100644 --- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h +++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h @@ -69,14 +69,6 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { } } - void Cancel() { -#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION - for (size_t i = 0; i < threads_.size(); i++) { - threads_[i]->OnCancel(); - } -#endif - } - int NumThreads() const final { return static_cast<int>(threads_.size()); } diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h deleted file mode 100644 index a05685f..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h +++ /dev/null @@ -1,23 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
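In the worker loop above, only one thread at a time is allowed to spin-wait for stealable work ("Leave one thread spinning. This reduces latency."); the election is the check-then-exchange on the atomic spinning_ flag. A stand-alone illustration of that election, with made-up counters added only to demonstrate the invariant:

#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

std::atomic<bool> spinning(false);  // same role as spinning_ above
std::atomic<int> concurrent(0);     // instrumentation only
std::atomic<int> violations(0);     // counts times two threads spun at once

void worker() {
  // Cheap read first, then the exchange; only the thread that flips the flag
  // from false to true wins the right to spin.
  if (!spinning && !spinning.exchange(true)) {
    if (++concurrent > 1) ++violations;  // never more than one spinner at a time
    // ... a bounded number of steal attempts would go here ...
    --concurrent;
    spinning = false;
  }
  // Losing threads would instead block on the EventCount until notified.
}

int main() {
  std::vector<std::thread> workers;
  for (int i = 0; i < 8; i++) workers.emplace_back(worker);
  for (auto& w : workers) w.join();
  std::cout << "violations: " << violations.load() << "\n";  // prints 0
}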
- -#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H -#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H - -// Try to come up with a portable way to cancel a thread -#if EIGEN_OS_GNULINUX - #define EIGEN_THREAD_CANCEL(t) \ - pthread_cancel(t.native_handle()); - #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1 -#else -#define EIGEN_THREAD_CANCEL(t) -#endif - - -#endif // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h index d94a064..399f95c 100644 --- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h +++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h @@ -23,8 +23,6 @@ struct StlThreadEnvironment { public: EnvThread(std::function<void()> f) : thr_(std::move(f)) {} ~EnvThread() { thr_.join(); } - // This function is called when the threadpool is cancelled. - void OnCancel() { } private: std::thread thr_; diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h index 84e1e6c..a65ee97 100644 --- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h +++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h @@ -16,14 +16,8 @@ namespace Eigen { // custom thread pools underneath. class ThreadPoolInterface { public: - // Submits a closure to be run by a thread in the pool. virtual void Schedule(std::function<void()> fn) = 0; - // If implemented, stop processing the closures that have been enqueued. - // Currently running closures may still be processed. - // If not implemented, does nothing. - virtual void Cancel() {} - // Returns the number of threads in the pool. virtual int NumThreads() const = 0; diff --git a/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h index 49d315a..ec27edd 100644 --- a/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +++ b/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h @@ -40,7 +40,7 @@ template<typename T, T... nn> struct numeric_list { constexpr static std::size_t count = sizeof...(nn); }; template<typename T, T n, T... nn> -struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; }; +struct numeric_list<T, n, nn...> { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; }; /* numeric list constructors * @@ -123,10 +123,6 @@ template<typename a, typename... as> struct get<0, type_lis template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {}; template<typename T, T a, T... as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; }; -template<std::size_t n, typename T, T a, T... 
as> constexpr T array_get(const numeric_list<T, a, as...>&) { - return get<(int)n, numeric_list<T, a, as...>>::value; -} - /* always get type, regardless of dummy; good for parameter pack expansion */ template<typename T, T dummy, typename t> struct id_numeric { typedef t type; }; diff --git a/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 573ca43..30d3ebc 100644 --- a/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -169,7 +169,6 @@ template <typename T> class array<T, 0> { #if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() { - EIGEN_UNUSED_VARIABLE(l); eigen_assert(l.size() == 0); } #endif @@ -201,15 +200,19 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) { return a[I]; } +template <typename T> struct array_size; template<class T, std::size_t N> struct array_size<array<T,N> > { static const size_t value = N; }; +template <typename T> struct array_size; template<class T, std::size_t N> struct array_size<array<T,N>& > { static const size_t value = N; }; +template <typename T> struct array_size; template<class T, std::size_t N> struct array_size<const array<T,N> > { static const size_t value = N; }; +template <typename T> struct array_size; template<class T, std::size_t N> struct array_size<const array<T,N>& > { static const size_t value = N; }; @@ -248,6 +251,14 @@ template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_ #undef STD_GET_ARR_HACK +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<const std::array<T,N> > { + static const size_t value = N; +}; +template <typename T> struct array_size; +template<class T, std::size_t N> struct array_size<std::array<T,N> > { + static const size_t value = N; +}; } // end namespace internal } // end namespace Eigen diff --git a/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index d280886..279fe5c 100644 --- a/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -683,4 +683,11 @@ template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> > } +namespace std { +template <typename T> +class numeric_limits<Eigen::AutoDiffScalar<T> > + : public numeric_limits<typename T::Scalar> {}; + +} // namespace std + #endif // EIGEN_AUTODIFF_SCALAR_H diff --git a/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h index a5d034d..13a0da1 100644 --- a/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h +++ b/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h @@ -12,6 +12,11 @@ namespace Eigen { + /*template<typename Other, + int OtherRows=Other::RowsAtCompileTime, + int OtherCols=Other::ColsAtCompileTime> + struct ei_eulerangles_assign_impl;*/ + /** \class EulerAngles * * \ingroup EulerAngles_Module @@ -31,7 +36,7 @@ namespace Eigen * ### Rotation representation and conversions ### * * It has been proved(see Wikipedia link below) that every rotation can be represented - * by Euler angles, but there is no single representation (e.g. unlike rotation matrices). + * by Euler angles, but there is no singular representation (e.g. unlike rotation matrices). * Therefore, you can convert from Eigen rotation and to them * (including rotation matrices, which is not called "rotations" by Eigen design). 
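The AutoDiffScalar hunk above adds a std::numeric_limits specialisation that forwards to the limits of the underlying scalar, so generic code querying epsilon(), min() and friends keeps working when the scalar is wrapped. The same technique for a made-up Wrapped<T> type (not the Eigen class):

#include <iostream>
#include <limits>

// Made-up wrapper around a scalar, standing in for a type such as
// AutoDiffScalar whose numeric behaviour is that of its underlying Scalar.
template <typename T>
struct Wrapped {
  typedef T Scalar;
  T value;
};

namespace std {
// Forward every numeric_limits query to the wrapped scalar type.
template <typename T>
class numeric_limits<Wrapped<T> > : public numeric_limits<typename Wrapped<T>::Scalar> {};
}  // namespace std

int main() {
  // Behaves exactly like numeric_limits<double> thanks to the forwarding base.
  std::cout << std::numeric_limits<Wrapped<double> >::epsilon() << "\n";
}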
* @@ -50,27 +55,33 @@ namespace Eigen * Additionally, some axes related computation is done in compile time. * * #### Euler angles ranges in conversions #### - * Rotations representation as EulerAngles are not single (unlike matrices), - * and even have infinite EulerAngles representations.<BR> - * For example, add or subtract 2*PI from either angle of EulerAngles - * and you'll get the same rotation. - * This is the general reason for infinite representation, - * but it's not the only general reason for not having a single representation. * - * When converting rotation to EulerAngles, this class convert it to specific ranges - * When converting some rotation to EulerAngles, the rules for ranges are as follow: - * - If the rotation we converting from is an EulerAngles - * (even when it represented as RotationBase explicitly), angles ranges are __undefined__. - * - otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR> - * As for Beta angle: - * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2]. - * - otherwise: - * - If the beta axis is positive, the beta angle will be in the range [0, PI] - * - If the beta axis is negative, the beta angle will be in the range [-PI, 0] + * When converting some rotation to Euler angles, there are some ways you can guarantee + * the Euler angles ranges. * + * #### implicit ranges #### + * When using implicit ranges, all angles are guarantee to be in the range [-PI, +PI], + * unless you convert from some other Euler angles. + * In this case, the range is __undefined__ (might be even less than -PI or greater than +2*PI). * \sa EulerAngles(const MatrixBase<Derived>&) * \sa EulerAngles(const RotationBase<Derived, 3>&) * + * #### explicit ranges #### + * When using explicit ranges, all angles are guarantee to be in the range you choose. + * In the range Boolean parameter, you're been ask whether you prefer the positive range or not: + * - _true_ - force the range between [0, +2*PI] + * - _false_ - force the range between [-PI, +PI] + * + * ##### compile time ranges ##### + * This is when you have compile time ranges and you prefer to + * use template parameter. (e.g. for performance) + * \sa FromRotation() + * + * ##### run-time time ranges ##### + * Run-time ranges are also supported. + * \sa EulerAngles(const MatrixBase<Derived>&, bool, bool, bool) + * \sa EulerAngles(const RotationBase<Derived, 3>&, bool, bool, bool) + * * ### Convenient user typedefs ### * * Convenient typedefs for EulerAngles exist for float and double scalar, @@ -92,7 +103,7 @@ namespace Eigen * * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles * - * \tparam _Scalar the scalar type, i.e. the type of the angles. + * \tparam _Scalar the scalar type, i.e., the type of the angles. * * \tparam _System the EulerSystem to use, which represents the axes of rotation. */ @@ -100,11 +111,8 @@ namespace Eigen class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3> { public: - typedef RotationBase<EulerAngles<_Scalar, _System>, 3> Base; - /** the scalar type of the angles */ typedef _Scalar Scalar; - typedef typename NumTraits<Scalar>::Real RealScalar; /** the EulerSystem to use, which represents the axes of rotation. */ typedef _System System; @@ -138,56 +146,67 @@ namespace Eigen public: /** Default constructor without initialization. */ EulerAngles() {} - /** Constructs and initialize an EulerAngles (\p alpha, \p beta, \p gamma). */ + /** Constructs and initialize Euler angles(\p alpha, \p beta, \p gamma). 
*/ EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) : m_angles(alpha, beta, gamma) {} - // TODO: Test this constructor - /** Constructs and initialize an EulerAngles from the array data {alpha, beta, gamma} */ - explicit EulerAngles(const Scalar* data) : m_angles(data) {} + /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m. + * + * \note All angles will be in the range [-PI, PI]. + */ + template<typename Derived> + EulerAngles(const MatrixBase<Derived>& m) { *this = m; } - /** Constructs and initializes an EulerAngles from either: - * - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1), - * - a 3D vector expression representing Euler angles. + /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m, + * with options to choose for each angle the requested range. + * + * If positive range is true, then the specified angle will be in the range [0, +2*PI]. + * Otherwise, the specified angle will be in the range [-PI, +PI]. * - * \note If \p other is a 3x3 rotation matrix, the angles range rules will be as follow:<BR> - * Alpha and gamma angles will be in the range [-PI, PI].<BR> - * As for Beta angle: - * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2]. - * - otherwise: - * - If the beta axis is positive, the beta angle will be in the range [0, PI] - * - If the beta axis is negative, the beta angle will be in the range [-PI, 0] - */ + * \param m The 3x3 rotation matrix to convert + * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + */ template<typename Derived> - explicit EulerAngles(const MatrixBase<Derived>& other) { *this = other; } + EulerAngles( + const MatrixBase<Derived>& m, + bool positiveRangeAlpha, + bool positiveRangeBeta, + bool positiveRangeGamma) { + + System::CalcEulerAngles(*this, m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma); + } /** Constructs and initialize Euler angles from a rotation \p rot. * - * \note If \p rot is an EulerAngles (even when it represented as RotationBase explicitly), - * angles ranges are __undefined__. - * Otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR> - * As for Beta angle: - * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2]. - * - otherwise: - * - If the beta axis is positive, the beta angle will be in the range [0, PI] - * - If the beta axis is negative, the beta angle will be in the range [-PI, 0] + * \note All angles will be in the range [-PI, PI], unless \p rot is an EulerAngles. + * If rot is an EulerAngles, expected EulerAngles range is __undefined__. + * (Use other functions here for enforcing range if this effect is desired) */ template<typename Derived> - EulerAngles(const RotationBase<Derived, 3>& rot) { System::CalcEulerAngles(*this, rot.toRotationMatrix()); } + EulerAngles(const RotationBase<Derived, 3>& rot) { *this = rot; } - /*EulerAngles(const QuaternionType& q) - { - // TODO: Implement it in a faster way for quaternions - // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/ - // we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below) - // Currently we compute all matrix cells from quaternion. 
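A sketch of the run-time range-selecting constructor documented above, assuming the unsupported EulerAngles module and its EulerAnglesZYXd convenience typedef; the rotation being converted is arbitrary.

    #include <unsupported/Eigen/EulerAngles>

    int main() {
      using namespace Eigen;
      Matrix3d R = (AngleAxisd(2.9, Vector3d::UnitZ()) *
                    AngleAxisd(0.4, Vector3d::UnitY()) *
                    AngleAxisd(-1.2, Vector3d::UnitX())).toRotationMatrix();
      // alpha is forced into [0, 2*PI]; beta and gamma stay in [-PI, +PI].
      EulerAnglesZYXd e(R, /*positiveRangeAlpha=*/true,
                           /*positiveRangeBeta=*/false,
                           /*positiveRangeGamma=*/false);
      return e.alpha() >= 0 ? 0 : 1;
    }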
- - // Special case only for ZYX - //Scalar y2 = q.y() * q.y(); - //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z()))); - //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x())); - //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2))); - }*/ + /** Constructs and initialize Euler angles from a rotation \p rot, + * with options to choose for each angle the requested range. + * + * If positive range is true, then the specified angle will be in the range [0, +2*PI]. + * Otherwise, the specified angle will be in the range [-PI, +PI]. + * + * \param rot The 3x3 rotation matrix to convert + * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + */ + template<typename Derived> + EulerAngles( + const RotationBase<Derived, 3>& rot, + bool positiveRangeAlpha, + bool positiveRangeBeta, + bool positiveRangeGamma) { + + System::CalcEulerAngles(*this, rot.toRotationMatrix(), positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma); + } /** \returns The angle values stored in a vector (alpha, beta, gamma). */ const Vector3& angles() const { return m_angles; } @@ -227,48 +246,90 @@ namespace Eigen return inverse(); } - /** Set \c *this from either: - * - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1), - * - a 3D vector expression representing Euler angles. + /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m, + * with options to choose for each angle the requested range (__only in compile time__). * - * See EulerAngles(const MatrixBase<Derived, 3>&) for more information about - * angles ranges output. + * If positive range is true, then the specified angle will be in the range [0, +2*PI]. + * Otherwise, the specified angle will be in the range [-PI, +PI]. + * + * \param m The 3x3 rotation matrix to convert + * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + */ + template< + bool PositiveRangeAlpha, + bool PositiveRangeBeta, + bool PositiveRangeGamma, + typename Derived> + static EulerAngles FromRotation(const MatrixBase<Derived>& m) + { + EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3) + + EulerAngles e; + System::template CalcEulerAngles< + PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma, _Scalar>(e, m); + return e; + } + + /** Constructs and initialize Euler angles from a rotation \p rot, + * with options to choose for each angle the requested range (__only in compile time__). + * + * If positive range is true, then the specified angle will be in the range [0, +2*PI]. + * Otherwise, the specified angle will be in the range [-PI, +PI]. + * + * \param rot The 3x3 rotation matrix to convert + * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. + * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI]. 
*/ - template<class Derived> - EulerAngles& operator=(const MatrixBase<Derived>& other) + template< + bool PositiveRangeAlpha, + bool PositiveRangeBeta, + bool PositiveRangeGamma, + typename Derived> + static EulerAngles FromRotation(const RotationBase<Derived, 3>& rot) + { + return FromRotation<PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma>(rot.toRotationMatrix()); + } + + /*EulerAngles& fromQuaternion(const QuaternionType& q) { - EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename Derived::Scalar>::value), - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + // TODO: Implement it in a faster way for quaternions + // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/ + // we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below) + // Currently we compute all matrix cells from quaternion. + + // Special case only for ZYX + //Scalar y2 = q.y() * q.y(); + //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z()))); + //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x())); + //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2))); + }*/ + + /** Set \c *this from a rotation matrix(i.e. pure orthogonal matrix with determinant of +1). */ + template<typename Derived> + EulerAngles& operator=(const MatrixBase<Derived>& m) { + EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3) - internal::eulerangles_assign_impl<System, Derived>::run(*this, other.derived()); + System::CalcEulerAngles(*this, m); return *this; } // TODO: Assign and construct from another EulerAngles (with different system) - /** Set \c *this from a rotation. - * - * See EulerAngles(const RotationBase<Derived, 3>&) for more information about - * angles ranges output. - */ + /** Set \c *this from a rotation. */ template<typename Derived> EulerAngles& operator=(const RotationBase<Derived, 3>& rot) { System::CalcEulerAngles(*this, rot.toRotationMatrix()); return *this; } - /** \returns \c true if \c *this is approximately equal to \a other, within the precision - * determined by \a prec. - * - * \sa MatrixBase::isApprox() */ - bool isApprox(const EulerAngles& other, - const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const - { return angles().isApprox(other.angles(), prec); } + // TODO: Support isApprox function /** \returns an equivalent 3x3 rotation matrix. 
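The compile-time variant FromRotation() introduced above is exercised the same way as in the doc/examples change further down; a sketch with an illustrative MyAngles typedef.

    #include <unsupported/Eigen/EulerAngles>

    int main() {
      using namespace Eigen;
      typedef EulerAngles<double, EulerSystem<EULER_Z, EULER_Y, EULER_Z> > MyAngles;
      Quaterniond q(AngleAxisd(0.3, Vector3d::UnitY()));
      // Template booleans select the range per angle at compile time:
      // alpha in [0, 2*PI], beta and gamma in [-PI, +PI].
      MyAngles a = MyAngles::FromRotation<true, false, false>(q);
      return a.alpha() >= 0 ? 0 : 1;
    }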
*/ Matrix3 toRotationMatrix() const { - // TODO: Calc it faster return static_cast<QuaternionType>(*this).toRotationMatrix(); } @@ -286,15 +347,6 @@ namespace Eigen s << eulerAngles.angles().transpose(); return s; } - - /** \returns \c *this with scalar type casted to \a NewScalarType */ - template <typename NewScalarType> - EulerAngles<NewScalarType, System> cast() const - { - EulerAngles<NewScalarType, System> e; - e.angles() = angles().template cast<NewScalarType>(); - return e; - } }; #define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \ @@ -327,29 +379,8 @@ EIGEN_EULER_ANGLES_TYPEDEFS(double, d) { typedef _Scalar Scalar; }; - - // set from a rotation matrix - template<class System, class Other> - struct eulerangles_assign_impl<System,Other,3,3> - { - typedef typename Other::Scalar Scalar; - static void run(EulerAngles<Scalar, System>& e, const Other& m) - { - System::CalcEulerAngles(e, m); - } - }; - - // set from a vector of Euler angles - template<class System, class Other> - struct eulerangles_assign_impl<System,Other,4,1> - { - typedef typename Other::Scalar Scalar; - static void run(EulerAngles<Scalar, System>& e, const Other& vec) - { - e.angles() = vec; - } - }; } + } #endif // EIGEN_EULERANGLESCLASS_H diff --git a/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h index 28f52da..98f9f64 100644 --- a/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h +++ b/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h @@ -18,7 +18,7 @@ namespace Eigen namespace internal { - // TODO: Add this trait to the Eigen internal API? + // TODO: Check if already exists on the rest API template <int Num, bool IsPositive = (Num > 0)> struct Abs { @@ -36,12 +36,6 @@ namespace Eigen { enum { value = Axis != 0 && Abs<Axis>::value <= 3 }; }; - - template<typename System, - typename Other, - int OtherRows=Other::RowsAtCompileTime, - int OtherCols=Other::ColsAtCompileTime> - struct eulerangles_assign_impl; } #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1] @@ -75,7 +69,7 @@ namespace Eigen * * You can use this class to get two things: * - Build an Euler system, and then pass it as a template parameter to EulerAngles. - * - Query some compile time data about an Euler system. (e.g. Whether it's Tait-Bryan) + * - Query some compile time data about an Euler system. (e.g. Whether it's tait bryan) * * Euler rotation is a set of three rotation on fixed axes. (see \ref EulerAngles) * This meta-class store constantly those signed axes. (see \ref EulerAxis) @@ -86,7 +80,7 @@ namespace Eigen * signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported: * - all axes X, Y, Z in each valid order (see below what order is valid) * - rotation over the axis is supported both over the positive and negative directions. - * - both Tait-Bryan and proper/classic Euler angles (i.e. the opposite). + * - both tait bryan and proper/classic Euler angles (i.e. the opposite). * * Since EulerSystem support both positive and negative directions, * you may call this rotation distinction in other names: @@ -96,7 +90,7 @@ namespace Eigen * Notice all axed combination are valid, and would trigger a static assertion. * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid. * This yield two and only two classes: - * - _Tait-Bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z} + * - _tait bryan_ - all unsigned axes are distinct, e.g. 
{X,Y,Z} * - _proper/classic Euler angles_ - The first and the third unsigned axes is equal, * and the second is different, e.g. {X,Y,X} * @@ -118,9 +112,9 @@ namespace Eigen * * \tparam _AlphaAxis the first fixed EulerAxis * - * \tparam _BetaAxis the second fixed EulerAxis + * \tparam _AlphaAxis the second fixed EulerAxis * - * \tparam _GammaAxis the third fixed EulerAxis + * \tparam _AlphaAxis the third fixed EulerAxis */ template <int _AlphaAxis, int _BetaAxis, int _GammaAxis> class EulerSystem @@ -144,16 +138,14 @@ namespace Eigen BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */ GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */ - IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */ - IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether beta axis is negative */ - IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */ - - // Parity is even if alpha axis X is followed by beta axis Y, or Y is followed - // by Z, or Z is followed by X; otherwise it is odd. - IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */ - IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */ + IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< weather alpha axis is negative */ + IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< weather beta axis is negative */ + IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< weather gamma axis is negative */ + + IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< weather the Euler system is odd */ + IsEven = IsOdd ? 0 : 1, /*!< weather the Euler system is even */ - IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is Tait-Bryan */ + IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< weather the Euler system is tait bryan */ }; private: @@ -188,70 +180,71 @@ namespace Eigen static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/) { using std::atan2; - using std::sqrt; + using std::sin; + using std::cos; typedef typename Derived::Scalar Scalar; - - const Scalar plusMinus = IsEven? 1 : -1; - const Scalar minusPlus = IsOdd? 
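The enum above exposes the system's properties at compile time; the following static_assert sketch (C++11, typedef names illustrative) spells out the values implied by the definitions in this hunk.

    #include <unsupported/Eigen/EulerAngles>

    typedef Eigen::EulerSystem<Eigen::EULER_X, Eigen::EULER_Y, Eigen::EULER_Z> SysXYZ;
    typedef Eigen::EulerSystem<Eigen::EULER_Z, Eigen::EULER_X, Eigen::EULER_Z> SysZXZ;

    // Distinct unsigned axes give a Tait-Bryan system; a repeated first/third axis
    // gives a proper/classic Euler system.
    static_assert(SysXYZ::IsTaitBryan == 1, "XYZ is Tait-Bryan");
    static_assert(SysZXZ::IsTaitBryan == 0, "ZXZ is proper/classic Euler");
    // X followed by Y is an even permutation of the axes.
    static_assert(SysXYZ::IsEven == 1 && SysXYZ::IsOdd == 0, "XYZ is even");

    int main() { return 0; }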
1 : -1; - - const Scalar Rsum = sqrt((mat(I,I) * mat(I,I) + mat(I,J) * mat(I,J) + mat(J,K) * mat(J,K) + mat(K,K) * mat(K,K))/2); - res[1] = atan2(plusMinus * mat(I,K), Rsum); - - // There is a singularity when cos(beta) == 0 - if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// cos(beta) != 0 - res[0] = atan2(minusPlus * mat(J, K), mat(K, K)); - res[2] = atan2(minusPlus * mat(I, J), mat(I, I)); - } - else if(plusMinus * mat(I, K) > 0) {// cos(beta) == 0 and sin(beta) == 1 - Scalar spos = mat(J, I) + plusMinus * mat(K, J); // 2*sin(alpha + plusMinus * gamma - Scalar cpos = mat(J, J) + minusPlus * mat(K, I); // 2*cos(alpha + plusMinus * gamma) - Scalar alphaPlusMinusGamma = atan2(spos, cpos); - res[0] = alphaPlusMinusGamma; - res[2] = 0; - } - else {// cos(beta) == 0 and sin(beta) == -1 - Scalar sneg = plusMinus * (mat(K, J) + minusPlus * mat(J, I)); // 2*sin(alpha + minusPlus*gamma) - Scalar cneg = mat(J, J) + plusMinus * mat(K, I); // 2*cos(alpha + minusPlus*gamma) - Scalar alphaMinusPlusBeta = atan2(sneg, cneg); - res[0] = alphaMinusPlusBeta; - res[2] = 0; + typedef Matrix<Scalar,2,1> Vector2; + + res[0] = atan2(mat(J,K), mat(K,K)); + Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm(); + if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) { + if(res[0] > Scalar(0)) { + res[0] -= Scalar(EIGEN_PI); + } + else { + res[0] += Scalar(EIGEN_PI); + } + res[1] = atan2(-mat(I,K), -c2); } + else + res[1] = atan2(-mat(I,K), c2); + Scalar s1 = sin(res[0]); + Scalar c1 = cos(res[0]); + res[2] = atan2(s1*mat(K,I)-c1*mat(J,I), c1*mat(J,J) - s1 * mat(K,J)); } template <typename Derived> - static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, - const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/) + static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/) { using std::atan2; - using std::sqrt; + using std::sin; + using std::cos; typedef typename Derived::Scalar Scalar; - - const Scalar plusMinus = IsEven? 1 : -1; - const Scalar minusPlus = IsOdd? 
1 : -1; - - const Scalar Rsum = sqrt((mat(I, J) * mat(I, J) + mat(I, K) * mat(I, K) + mat(J, I) * mat(J, I) + mat(K, I) * mat(K, I)) / 2); - - res[1] = atan2(Rsum, mat(I, I)); - - // There is a singularity when sin(beta) == 0 - if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// sin(beta) != 0 - res[0] = atan2(mat(J, I), minusPlus * mat(K, I)); - res[2] = atan2(mat(I, J), plusMinus * mat(I, K)); - } - else if(mat(I, I) > 0) {// sin(beta) == 0 and cos(beta) == 1 - Scalar spos = plusMinus * mat(K, J) + minusPlus * mat(J, K); // 2*sin(alpha + gamma) - Scalar cpos = mat(J, J) + mat(K, K); // 2*cos(alpha + gamma) - res[0] = atan2(spos, cpos); - res[2] = 0; + typedef Matrix<Scalar,2,1> Vector2; + + res[0] = atan2(mat(J,I), mat(K,I)); + if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) + { + if(res[0] > Scalar(0)) { + res[0] -= Scalar(EIGEN_PI); + } + else { + res[0] += Scalar(EIGEN_PI); + } + Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm(); + res[1] = -atan2(s2, mat(I,I)); } - else {// sin(beta) == 0 and cos(beta) == -1 - Scalar sneg = plusMinus * mat(K, J) + plusMinus * mat(J, K); // 2*sin(alpha - gamma) - Scalar cneg = mat(J, J) - mat(K, K); // 2*cos(alpha - gamma) - res[0] = atan2(sneg, cneg); - res[2] = 0; + else + { + Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm(); + res[1] = atan2(s2, mat(I,I)); } + + // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles, + // we can compute their respective rotation, and apply its inverse to M. Since the result must + // be a rotation around x, we have: + // + // c2 s1.s2 c1.s2 1 0 0 + // 0 c1 -s1 * M = 0 c3 s3 + // -s2 s1.c2 c1.c2 0 -s3 c3 + // + // Thus: m11.c1 - m21.s1 = c3 & m12.c1 - m22.s1 = s3 + + Scalar s1 = sin(res[0]); + Scalar c1 = cos(res[0]); + res[2] = atan2(c1*mat(J,K)-s1*mat(K,K), c1*mat(J,J) - s1 * mat(K,J)); } template<typename Scalar> @@ -259,28 +252,55 @@ namespace Eigen EulerAngles<Scalar, EulerSystem>& res, const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat) { + CalcEulerAngles(res, mat, false, false, false); + } + + template< + bool PositiveRangeAlpha, + bool PositiveRangeBeta, + bool PositiveRangeGamma, + typename Scalar> + static void CalcEulerAngles( + EulerAngles<Scalar, EulerSystem>& res, + const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat) + { + CalcEulerAngles(res, mat, PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma); + } + + template<typename Scalar> + static void CalcEulerAngles( + EulerAngles<Scalar, EulerSystem>& res, + const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat, + bool PositiveRangeAlpha, + bool PositiveRangeBeta, + bool PositiveRangeGamma) + { CalcEulerAngles_imp( res.angles(), mat, typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type()); - if (IsAlphaOpposite) + if (IsAlphaOpposite == IsOdd) res.alpha() = -res.alpha(); - if (IsBetaOpposite) + if (IsBetaOpposite == IsOdd) res.beta() = -res.beta(); - if (IsGammaOpposite) + if (IsGammaOpposite == IsOdd) res.gamma() = -res.gamma(); + + // Saturate results to the requested range + if (PositiveRangeAlpha && (res.alpha() < 0)) + res.alpha() += Scalar(2 * EIGEN_PI); + + if (PositiveRangeBeta && (res.beta() < 0)) + res.beta() += Scalar(2 * EIGEN_PI); + + if (PositiveRangeGamma && (res.gamma() < 0)) + res.gamma() += Scalar(2 * EIGEN_PI); } template <typename _Scalar, class _System> friend class Eigen::EulerAngles; - - template<typename System, - typename Other, - int OtherRows, - int OtherCols> - friend struct internal::eulerangles_assign_impl; }; 
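The extraction code above follows the same atan2-based scheme as MatrixBase::eulerAngles(), which the unit test further down compares against. A quick round-trip check of that convention, away from the beta singularity; the sample angles are arbitrary.

    #include <Eigen/Geometry>
    #include <cassert>

    int main() {
      using namespace Eigen;
      Matrix3d m = (AngleAxisd(0.5, Vector3d::UnitX()) *
                    AngleAxisd(0.3, Vector3d::UnitY()) *
                    AngleAxisd(-0.2, Vector3d::UnitZ())).toRotationMatrix();
      // eulerAngles(0,1,2) returns angles a such that m equals the X*Y*Z product below.
      Vector3d a = m.eulerAngles(0, 1, 2);
      Matrix3d mbis = (AngleAxisd(a[0], Vector3d::UnitX()) *
                       AngleAxisd(a[1], Vector3d::UnitY()) *
                       AngleAxisd(a[2], Vector3d::UnitZ())).toRotationMatrix();
      assert(m.isApprox(mbis));
      return 0;
    }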
#define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \ diff --git a/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h index db2449d..3f7d777 100644 --- a/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h +++ b/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h @@ -398,8 +398,8 @@ struct matrix_function_compute template <typename MatrixType> struct matrix_function_compute<MatrixType, 0> { - template <typename AtomicType, typename ResultType> - static void run(const MatrixType& A, AtomicType& atomic, ResultType &result) + template <typename MatA, typename AtomicType, typename ResultType> + static void run(const MatA& A, AtomicType& atomic, ResultType &result) { typedef internal::traits<MatrixType> Traits; typedef typename Traits::Scalar Scalar; @@ -422,11 +422,10 @@ struct matrix_function_compute<MatrixType, 0> template <typename MatrixType> struct matrix_function_compute<MatrixType, 1> { - template <typename AtomicType, typename ResultType> - static void run(const MatrixType& A, AtomicType& atomic, ResultType &result) + template <typename MatA, typename AtomicType, typename ResultType> + static void run(const MatA& A, AtomicType& atomic, ResultType &result) { typedef internal::traits<MatrixType> Traits; - typedef typename MatrixType::Index Index; // compute Schur decomposition of A const ComplexSchur<MatrixType> schurOfA(A); @@ -514,7 +513,7 @@ template<typename Derived> class MatrixFunctionReturnValue typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType; AtomicType atomic(m_f); - internal::matrix_function_compute<NestedEvalTypeClean>::run(m_A, atomic, result); + internal::matrix_function_compute<typename NestedEvalTypeClean::PlainObject>::run(m_A, atomic, result); } Index rows() const { return m_A.rows(); } diff --git a/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h index 1acfbed..ff8f6e7 100644 --- a/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h +++ b/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h @@ -339,7 +339,7 @@ public: typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType; AtomicType atomic; - internal::matrix_function_compute<DerivedEvalTypeClean>::run(m_A, atomic, result); + internal::matrix_function_compute<typename DerivedEvalTypeClean::PlainObject>::run(m_A, atomic, result); } Index rows() const { return m_A.rows(); } diff --git a/eigen/unsupported/Eigen/src/Polynomials/Companion.h b/eigen/unsupported/Eigen/src/Polynomials/Companion.h index e0af6eb..b515c29 100644 --- a/eigen/unsupported/Eigen/src/Polynomials/Companion.h +++ b/eigen/unsupported/Eigen/src/Polynomials/Companion.h @@ -75,7 +75,7 @@ class companion void setPolynomial( const VectorType& poly ) { const Index deg = poly.size()-1; - m_monic = Scalar(-1)/poly[deg] * poly.head(deg); + m_monic = -1/poly[deg] * poly.head(deg); //m_bl_diag.setIdentity( deg-1 ); m_bl_diag.setOnes(deg-1); } @@ -107,8 +107,8 @@ class companion * colB and rowB are repectively the multipliers for * the column and the row in order to balance them. * */ - bool balanced( RealScalar colNorm, RealScalar rowNorm, - bool& isBalanced, RealScalar& colB, RealScalar& rowB ); + bool balanced( Scalar colNorm, Scalar rowNorm, + bool& isBalanced, Scalar& colB, Scalar& rowB ); /** Helper function for the balancing algorithm. 
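matrix_function_compute, whose template parameters change above, ultimately backs the user-facing matrix functions; a typical entry point through the unsupported MatrixFunctions module (the 4x4 random matrix is illustrative).

    #include <Eigen/Dense>
    #include <unsupported/Eigen/MatrixFunctions>
    #include <iostream>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
      // Both expressions evaluate through MatrixFunctionReturnValue, which hands the
      // (Schur-decomposed) argument to matrix_function_compute<...>::run().
      Eigen::MatrixXd sinA = A.sin();
      Eigen::MatrixXd cosA = A.cos();
      // sin(A)^2 + cos(A)^2 = I holds for matrix arguments as well; the norm below
      // should be at round-off level.
      std::cout << (sinA * sinA + cosA * cosA - Eigen::MatrixXd::Identity(4, 4)).norm()
                << std::endl;
      return 0;
    }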
* \returns true if the row and the column, having colNorm and rowNorm @@ -116,8 +116,8 @@ class companion * colB and rowB are repectively the multipliers for * the column and the row in order to balance them. * */ - bool balancedR( RealScalar colNorm, RealScalar rowNorm, - bool& isBalanced, RealScalar& colB, RealScalar& rowB ); + bool balancedR( Scalar colNorm, Scalar rowNorm, + bool& isBalanced, Scalar& colB, Scalar& rowB ); public: /** @@ -139,10 +139,10 @@ class companion template< typename _Scalar, int _Deg > inline -bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm, - bool& isBalanced, RealScalar& colB, RealScalar& rowB ) +bool companion<_Scalar,_Deg>::balanced( Scalar colNorm, Scalar rowNorm, + bool& isBalanced, Scalar& colB, Scalar& rowB ) { - if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; } + if( Scalar(0) == colNorm || Scalar(0) == rowNorm ){ return true; } else { //To find the balancing coefficients, if the radix is 2, @@ -150,29 +150,29 @@ bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm, // \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$ // then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$ // and the balancing coefficient for the column is \f$ 2^{\sigma} \f$ - rowB = rowNorm / radix<RealScalar>(); - colB = RealScalar(1); - const RealScalar s = colNorm + rowNorm; + rowB = rowNorm / radix<Scalar>(); + colB = Scalar(1); + const Scalar s = colNorm + rowNorm; while (colNorm < rowB) { - colB *= radix<RealScalar>(); - colNorm *= radix2<RealScalar>(); + colB *= radix<Scalar>(); + colNorm *= radix2<Scalar>(); } - rowB = rowNorm * radix<RealScalar>(); + rowB = rowNorm * radix<Scalar>(); while (colNorm >= rowB) { - colB /= radix<RealScalar>(); - colNorm /= radix2<RealScalar>(); + colB /= radix<Scalar>(); + colNorm /= radix2<Scalar>(); } //This line is used to avoid insubstantial balancing - if ((rowNorm + colNorm) < RealScalar(0.95) * s * colB) + if ((rowNorm + colNorm) < Scalar(0.95) * s * colB) { isBalanced = false; - rowB = RealScalar(1) / colB; + rowB = Scalar(1) / colB; return false; } else{ @@ -182,21 +182,21 @@ bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm, template< typename _Scalar, int _Deg > inline -bool companion<_Scalar,_Deg>::balancedR( RealScalar colNorm, RealScalar rowNorm, - bool& isBalanced, RealScalar& colB, RealScalar& rowB ) +bool companion<_Scalar,_Deg>::balancedR( Scalar colNorm, Scalar rowNorm, + bool& isBalanced, Scalar& colB, Scalar& rowB ) { - if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; } + if( Scalar(0) == colNorm || Scalar(0) == rowNorm ){ return true; } else { /** * Set the norm of the column and the row to the geometric mean * of the row and column norm */ - const RealScalar q = colNorm/rowNorm; + const _Scalar q = colNorm/rowNorm; if( !isApprox( q, _Scalar(1) ) ) { rowB = sqrt( colNorm/rowNorm ); - colB = RealScalar(1)/rowB; + colB = Scalar(1)/rowB; isBalanced = false; return false; @@ -219,8 +219,8 @@ void companion<_Scalar,_Deg>::balance() while( !hasConverged ) { hasConverged = true; - RealScalar colNorm,rowNorm; - RealScalar colB,rowB; + Scalar colNorm,rowNorm; + Scalar colB,rowB; //First row, first column excluding the diagonal //============================================== diff --git a/eigen/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/eigen/unsupported/Eigen/src/Polynomials/PolynomialSolver.h index 7885942..03198ec 100644 --- 
a/eigen/unsupported/Eigen/src/Polynomials/PolynomialSolver.h +++ b/eigen/unsupported/Eigen/src/Polynomials/PolynomialSolver.h @@ -99,7 +99,7 @@ class PolynomialSolverBase */ inline const RootType& greatestRoot() const { - std::greater<RealScalar> greater; + std::greater<Scalar> greater; return selectComplexRoot_withRespectToNorm( greater ); } @@ -108,7 +108,7 @@ class PolynomialSolverBase */ inline const RootType& smallestRoot() const { - std::less<RealScalar> less; + std::less<Scalar> less; return selectComplexRoot_withRespectToNorm( less ); } @@ -213,7 +213,7 @@ class PolynomialSolverBase bool& hasArealRoot, const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const { - std::greater<RealScalar> greater; + std::greater<Scalar> greater; return selectRealRoot_withRespectToAbsRealPart( greater, hasArealRoot, absImaginaryThreshold ); } @@ -236,7 +236,7 @@ class PolynomialSolverBase bool& hasArealRoot, const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const { - std::less<RealScalar> less; + std::less<Scalar> less; return selectRealRoot_withRespectToAbsRealPart( less, hasArealRoot, absImaginaryThreshold ); } @@ -259,7 +259,7 @@ class PolynomialSolverBase bool& hasArealRoot, const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const { - std::greater<RealScalar> greater; + std::greater<Scalar> greater; return selectRealRoot_withRespectToRealPart( greater, hasArealRoot, absImaginaryThreshold ); } @@ -282,7 +282,7 @@ class PolynomialSolverBase bool& hasArealRoot, const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const { - std::less<RealScalar> less; + std::less<Scalar> less; return selectRealRoot_withRespectToRealPart( less, hasArealRoot, absImaginaryThreshold ); } @@ -327,7 +327,7 @@ class PolynomialSolverBase * However, almost always, correct accuracy is reached even in these cases for 64bit * (double) floating types and small polynomial degree (<20). */ -template<typename _Scalar, int _Deg> +template< typename _Scalar, int _Deg > class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg> { public: @@ -337,9 +337,7 @@ class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg> EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base ) typedef Matrix<Scalar,_Deg,_Deg> CompanionMatrixType; - typedef typename internal::conditional<NumTraits<Scalar>::IsComplex, - ComplexEigenSolver<CompanionMatrixType>, - EigenSolver<CompanionMatrixType> >::type EigenSolverType; + typedef EigenSolver<CompanionMatrixType> EigenSolverType; public: /** Computes the complex roots of a new polynomial. 
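A sketch of how PolynomialSolver, whose comparators and solver selection change above, is typically used; the quadratic and its coefficients are illustrative and given in increasing order of degree.

    #include <unsupported/Eigen/Polynomials>
    #include <iostream>

    int main() {
      // p(x) = 2 - 3x + x^2, i.e. roots 1 and 2; coefficients ordered by increasing degree.
      Eigen::Vector3d coeffs(2.0, -3.0, 1.0);
      Eigen::PolynomialSolver<double, 2> solver(coeffs);

      std::cout << "complex roots: " << solver.roots().transpose() << std::endl;

      bool hasRealRoot = false;
      // Selects among the computed roots using the comparators shown in the hunk above.
      double r = solver.greatestRealRoot(hasRealRoot);
      std::cout << "greatest real root: " << r << " (found: " << hasRealRoot << ")" << std::endl;
      return 0;
    }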
*/ diff --git a/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h b/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h index fc70a24..cdc14f8 100644 --- a/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h +++ b/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h @@ -12,38 +12,38 @@ #define EIGEN_SPARSE_MARKET_IO_H #include <iostream> -#include <vector> namespace Eigen { namespace internal { - template <typename Scalar, typename StorageIndex> - inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, Scalar& value) + template <typename Scalar> + inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, Scalar& value) { - std::stringstream sline(line); - sline >> i >> j >> value; + line >> i >> j >> value; + i--; + j--; + if(i>=0 && j>=0 && i<M && j<N) + { + return true; + } + else + return false; } - - template<> inline void GetMarketLine (const char* line, int& i, int& j, float& value) - { std::sscanf(line, "%d %d %g", &i, &j, &value); } - - template<> inline void GetMarketLine (const char* line, int& i, int& j, double& value) - { std::sscanf(line, "%d %d %lg", &i, &j, &value); } - - template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<float>& value) - { std::sscanf(line, "%d %d %g %g", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); } - - template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<double>& value) - { std::sscanf(line, "%d %d %lg %lg", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); } - - template <typename Scalar, typename StorageIndex> - inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, std::complex<Scalar>& value) + template <typename Scalar> + inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, std::complex<Scalar>& value) { - std::stringstream sline(line); Scalar valR, valI; - sline >> i >> j >> valR >> valI; - value = std::complex<Scalar>(valR,valI); + line >> i >> j >> valR >> valI; + i--; + j--; + if(i>=0 && j>=0 && i<M && j<N) + { + value = std::complex<Scalar>(valR, valI); + return true; + } + else + return false; } template <typename RealScalar> @@ -81,13 +81,13 @@ namespace internal } } - template<typename Scalar, typename StorageIndex> - inline void PutMatrixElt(Scalar value, StorageIndex row, StorageIndex col, std::ofstream& out) + template<typename Scalar> + inline void PutMatrixElt(Scalar value, int row, int col, std::ofstream& out) { out << row << " "<< col << " " << value << "\n"; } - template<typename Scalar, typename StorageIndex> - inline void PutMatrixElt(std::complex<Scalar> value, StorageIndex row, StorageIndex col, std::ofstream& out) + template<typename Scalar> + inline void PutMatrixElt(std::complex<Scalar> value, int row, int col, std::ofstream& out) { out << row << " " << col << " " << value.real() << " " << value.imag() << "\n"; } @@ -133,20 +133,17 @@ template<typename SparseMatrixType> bool loadMarket(SparseMatrixType& mat, const std::string& filename) { typedef typename SparseMatrixType::Scalar Scalar; - typedef typename SparseMatrixType::StorageIndex StorageIndex; + typedef typename SparseMatrixType::Index Index; std::ifstream input(filename.c_str(),std::ios::in); if(!input) return false; - - char rdbuffer[4096]; - input.rdbuf()->pubsetbuf(rdbuffer, 4096); const int maxBuffersize = 2048; char buffer[maxBuffersize]; bool readsizes = false; - typedef Triplet<Scalar,StorageIndex> T; + typedef Triplet<Scalar,Index> T; std::vector<T> 
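The GetMarketLine/PutMatrixElt helpers above back the public loadMarket/saveMarket entry points; a round-trip sketch, where the matrix entries and the file name A.mtx are illustrative.

    #include <Eigen/Sparse>
    #include <unsupported/Eigen/SparseExtra>

    int main() {
      Eigen::SparseMatrix<double> A(3, 3);
      A.insert(0, 0) = 1.0;
      A.insert(1, 2) = 2.5;
      A.insert(2, 1) = -4.0;
      A.makeCompressed();

      // saveMarket writes 1-based "row col value" triplets via PutMatrixElt;
      // loadMarket parses them back through GetMarketLine.
      Eigen::saveMarket(A, "A.mtx");
      Eigen::SparseMatrix<double> B;
      bool ok = Eigen::loadMarket(B, "A.mtx");
      return (ok && B.nonZeros() == A.nonZeros()) ? 0 : 1;
    }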
elements; Index M(-1), N(-1), NNZ(-1); @@ -157,36 +154,33 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename) //NOTE An appropriate test should be done on the header to get the symmetry if(buffer[0]=='%') continue; - + + std::stringstream line(buffer); + if(!readsizes) { - std::stringstream line(buffer); line >> M >> N >> NNZ; if(M > 0 && N > 0 && NNZ > 0) { readsizes = true; + //std::cout << "sizes: " << M << "," << N << "," << NNZ << "\n"; mat.resize(M,N); mat.reserve(NNZ); } } else { - StorageIndex i(-1), j(-1); + Index i(-1), j(-1); Scalar value; - internal::GetMarketLine(buffer, i, j, value); - - i--; - j--; - if(i>=0 && j>=0 && i<M && j<N) + if( internal::GetMarketLine(line, M, N, i, j, value) ) { - ++count; + ++ count; elements.push_back(T(i,j,value)); } - else + else std::cerr << "Invalid read: " << i << "," << j << "\n"; } } - mat.setFromTriplets(elements.begin(), elements.end()); if(count!=NNZ) std::cerr << count << "!=" << NNZ << "\n"; @@ -231,13 +225,12 @@ template<typename SparseMatrixType> bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sym = 0) { typedef typename SparseMatrixType::Scalar Scalar; - typedef typename SparseMatrixType::RealScalar RealScalar; std::ofstream out(filename.c_str(),std::ios::out); if(!out) return false; out.flags(std::ios_base::scientific); - out.precision(std::numeric_limits<RealScalar>::digits10 + 2); + out.precision(64); std::string header; internal::putMarketHeader<Scalar>(header, sym); out << header << std::endl; @@ -248,6 +241,7 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy { ++ count; internal::PutMatrixElt(it.value(), it.row()+1, it.col()+1, out); + // out << it.row()+1 << " " << it.col()+1 << " " << it.value() << "\n"; } out.close(); return true; @@ -256,14 +250,13 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy template<typename VectorType> bool saveMarketVector (const VectorType& vec, const std::string& filename) { - typedef typename VectorType::Scalar Scalar; - typedef typename VectorType::RealScalar RealScalar; + typedef typename VectorType::Scalar Scalar; std::ofstream out(filename.c_str(),std::ios::out); if(!out) return false; out.flags(std::ios_base::scientific); - out.precision(std::numeric_limits<RealScalar>::digits10 + 2); + out.precision(64); if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value) out << "%%MatrixMarket matrix array complex general\n"; else diff --git a/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 369ad97..f524d71 100644 --- a/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -122,8 +122,8 @@ struct lgamma_impl<float> { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(float x) { #if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) - int dummy; - return ::lgammaf_r(x, &dummy); + int signgam; + return ::lgammaf_r(x, &signgam); #else return ::lgammaf(x); #endif @@ -135,8 +135,8 @@ struct lgamma_impl<double> { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(double x) { #if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) - int dummy; - return ::lgamma_r(x, &dummy); + int signgam; + return ::lgamma_r(x, &signgam); #else return 
::lgamma(x); #endif diff --git a/eigen/unsupported/doc/examples/EulerAngles.cpp b/eigen/unsupported/doc/examples/EulerAngles.cpp index 3f8ca8c..1ef6aee 100644 --- a/eigen/unsupported/doc/examples/EulerAngles.cpp +++ b/eigen/unsupported/doc/examples/EulerAngles.cpp @@ -23,7 +23,7 @@ int main() // Some Euler angles representation that our plane use. EulerAnglesZYZd planeAngles(0.78474, 0.5271, -0.513794); - MyArmyAngles planeAnglesInMyArmyAngles(planeAngles); + MyArmyAngles planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeAngles); std::cout << "vehicle angles(MyArmy): " << vehicleAngles << std::endl; std::cout << "plane angles(ZYZ): " << planeAngles << std::endl; @@ -37,7 +37,7 @@ int main() Quaterniond planeRotated = AngleAxisd(-0.342, Vector3d::UnitY()) * planeAngles; planeAngles = planeRotated; - planeAnglesInMyArmyAngles = planeRotated; + planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeRotated); std::cout << "new plane angles(ZYZ): " << planeAngles << std::endl; std::cout << "new plane angles(MyArmy): " << planeAnglesInMyArmyAngles << std::endl; diff --git a/eigen/unsupported/test/CMakeLists.txt b/eigen/unsupported/test/CMakeLists.txt index 003c9de..b5fa1c8 100644 --- a/eigen/unsupported/test/CMakeLists.txt +++ b/eigen/unsupported/test/CMakeLists.txt @@ -21,17 +21,6 @@ include_directories(../../test ../../unsupported ../../Eigen find_package (Threads) -find_package(Xsmm) -if(XSMM_FOUND) - add_definitions("-DEIGEN_USE_LIBXSMM") - include_directories(${XSMM_INCLUDES}) - link_directories(${XSMM_LIBRARIES}) - set(EXTERNAL_LIBS ${EXTERNAL_LIBS} xsmm) - ei_add_property(EIGEN_TESTED_BACKENDS "Xsmm, ") -else(XSMM_FOUND) - ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ") -endif(XSMM_FOUND) - find_package(GoogleHash) if(GOOGLEHASH_FOUND) add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT") @@ -157,16 +146,6 @@ if(EIGEN_TEST_CXX11) ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11") ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_morphing_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_shuffling_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11") - ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11") endif(EIGEN_TEST_SYCL) # It should be safe to always run these tests as there is some fallback code for # older compiler that don't support cxx11. diff --git a/eigen/unsupported/test/EulerAngles.cpp b/eigen/unsupported/test/EulerAngles.cpp index 79ee728..a8cb528 100644 --- a/eigen/unsupported/test/EulerAngles.cpp +++ b/eigen/unsupported/test/EulerAngles.cpp @@ -13,219 +13,146 @@ using namespace Eigen; -// Unfortunately, we need to specialize it in order to work. 
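The lgamma hunk above calls the re-entrant lgamma_r/lgammaf_r variants and discards the sign output. A roughly equivalent standalone call, assuming a glibc/BSD toolchain where ::lgamma_r is declared; the function name is illustrative.

    #include <cmath>

    // Re-entrant evaluation of log(|Gamma(x)|); the sign of Gamma(x) is written to the
    // local 'sign' instead of the global 'signgam', so concurrent calls do not race.
    double lgamma_threadsafe(double x) {
      int sign = 0;
      return ::lgamma_r(x, &sign);
    }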
(We could add it in main.h test framework) -template <typename Scalar, class System> -bool verifyIsApprox(const Eigen::EulerAngles<Scalar, System>& a, const Eigen::EulerAngles<Scalar, System>& b) -{ - return verifyIsApprox(a.angles(), b.angles()); -} - -// Verify that x is in the approxed range [a, b] -#define VERIFY_APPROXED_RANGE(a, x, b) \ - do { \ - VERIFY_IS_APPROX_OR_LESS_THAN(a, x); \ - VERIFY_IS_APPROX_OR_LESS_THAN(x, b); \ - } while(0) - -const char X = EULER_X; -const char Y = EULER_Y; -const char Z = EULER_Z; - -template<typename Scalar, class EulerSystem> -void verify_euler(const EulerAngles<Scalar, EulerSystem>& e) +template<typename EulerSystem, typename Scalar> +void verify_euler_ranged(const Matrix<Scalar,3,1>& ea, + bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma) { typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType; typedef Matrix<Scalar,3,3> Matrix3; typedef Matrix<Scalar,3,1> Vector3; typedef Quaternion<Scalar> QuaternionType; typedef AngleAxis<Scalar> AngleAxisType; + using std::abs; - const Scalar ONE = Scalar(1); - const Scalar HALF_PI = Scalar(EIGEN_PI / 2); - const Scalar PI = Scalar(EIGEN_PI); + Scalar alphaRangeStart, alphaRangeEnd; + Scalar betaRangeStart, betaRangeEnd; + Scalar gammaRangeStart, gammaRangeEnd; - // It's very important calc the acceptable precision depending on the distance from the pole. - const Scalar longitudeRadius = std::abs( - EulerSystem::IsTaitBryan ? - std::cos(e.beta()) : - std::sin(e.beta()) - ); - Scalar precision = test_precision<Scalar>() / longitudeRadius; + if (positiveRangeAlpha) + { + alphaRangeStart = Scalar(0); + alphaRangeEnd = Scalar(2 * EIGEN_PI); + } + else + { + alphaRangeStart = -Scalar(EIGEN_PI); + alphaRangeEnd = Scalar(EIGEN_PI); + } - Scalar betaRangeStart, betaRangeEnd; - if (EulerSystem::IsTaitBryan) + if (positiveRangeBeta) + { + betaRangeStart = Scalar(0); + betaRangeEnd = Scalar(2 * EIGEN_PI); + } + else + { + betaRangeStart = -Scalar(EIGEN_PI); + betaRangeEnd = Scalar(EIGEN_PI); + } + + if (positiveRangeGamma) { - betaRangeStart = -HALF_PI; - betaRangeEnd = HALF_PI; + gammaRangeStart = Scalar(0); + gammaRangeEnd = Scalar(2 * EIGEN_PI); } else { - if (!EulerSystem::IsBetaOpposite) - { - betaRangeStart = 0; - betaRangeEnd = PI; - } - else - { - betaRangeStart = -PI; - betaRangeEnd = 0; - } + gammaRangeStart = -Scalar(EIGEN_PI); + gammaRangeEnd = Scalar(EIGEN_PI); } + const int i = EulerSystem::AlphaAxisAbs - 1; + const int j = EulerSystem::BetaAxisAbs - 1; + const int k = EulerSystem::GammaAxisAbs - 1; + + const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1; + const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1; + const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1; + const Vector3 I = EulerAnglesType::AlphaAxisVector(); const Vector3 J = EulerAnglesType::BetaAxisVector(); const Vector3 K = EulerAnglesType::GammaAxisVector(); - // Is approx checks - VERIFY(e.isApprox(e)); - VERIFY_IS_APPROX(e, e); - VERIFY_IS_NOT_APPROX(e, EulerAnglesType(e.alpha() + ONE, e.beta() + ONE, e.gamma() + ONE)); - - const Matrix3 m(e); - VERIFY_IS_APPROX(Scalar(m.determinant()), ONE); - - EulerAnglesType ebis(m); + EulerAnglesType e(ea[0], ea[1], ea[2]); - // When no roll(acting like polar representation), we have the best precision. - // One of those cases is when the Euler angles are on the pole, and because it's singular case, - // the computation returns no roll. 
- if (ebis.beta() == 0) - precision = test_precision<Scalar>(); + Matrix3 m(e); + Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles(); // Check that eabis in range - VERIFY_APPROXED_RANGE(-PI, ebis.alpha(), PI); - VERIFY_APPROXED_RANGE(betaRangeStart, ebis.beta(), betaRangeEnd); - VERIFY_APPROXED_RANGE(-PI, ebis.gamma(), PI); - - const Matrix3 mbis(AngleAxisType(ebis.alpha(), I) * AngleAxisType(ebis.beta(), J) * AngleAxisType(ebis.gamma(), K)); - VERIFY_IS_APPROX(Scalar(mbis.determinant()), ONE); - VERIFY_IS_APPROX(mbis, ebis.toRotationMatrix()); - /*std::cout << "===================\n" << - "e: " << e << std::endl << - "eabis: " << eabis.transpose() << std::endl << - "m: " << m << std::endl << - "mbis: " << mbis << std::endl << - "X: " << (m * Vector3::UnitX()).transpose() << std::endl << - "X: " << (mbis * Vector3::UnitX()).transpose() << std::endl;*/ - VERIFY(m.isApprox(mbis, precision)); - - // Test if ea and eabis are the same - // Need to check both singular and non-singular cases - // There are two singular cases. - // 1. When I==K and sin(ea(1)) == 0 - // 2. When I!=K and cos(ea(1)) == 0 - - // TODO: Make this test work well, and use range saturation function. - /*// If I==K, and ea[1]==0, then there no unique solution. - // The remark apply in the case where I!=K, and |ea[1]| is close to +-pi/2. - if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) - VERIFY_IS_APPROX(ea, eabis);*/ + VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd); + VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd); + VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd); - // Quaternions - const QuaternionType q(e); - ebis = q; - const QuaternionType qbis(ebis); - VERIFY(internal::isApprox<Scalar>(std::abs(q.dot(qbis)), ONE, precision)); - //VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same + Vector3 eabis2 = m.eulerAngles(i, j, k); - // A suggestion for simple product test when will be supported. 
- /*EulerAnglesType e2(PI/2, PI/2, PI/2); - Matrix3 m2(e2); - VERIFY_IS_APPROX(e*e2, m*m2);*/ -} - -template<signed char A, signed char B, signed char C, typename Scalar> -void verify_euler_vec(const Matrix<Scalar,3,1>& ea) -{ - verify_euler(EulerAngles<Scalar, EulerSystem<A, B, C> >(ea[0], ea[1], ea[2])); -} - -template<signed char A, signed char B, signed char C, typename Scalar> -void verify_euler_all_neg(const Matrix<Scalar,3,1>& ea) -{ - verify_euler_vec<+A,+B,+C>(ea); - verify_euler_vec<+A,+B,-C>(ea); - verify_euler_vec<+A,-B,+C>(ea); - verify_euler_vec<+A,-B,-C>(ea); + // Invert the relevant axes + eabis2[0] *= iFactor; + eabis2[1] *= jFactor; + eabis2[2] *= kFactor; - verify_euler_vec<-A,+B,+C>(ea); - verify_euler_vec<-A,+B,-C>(ea); - verify_euler_vec<-A,-B,+C>(ea); - verify_euler_vec<-A,-B,-C>(ea); -} - -template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea) -{ - verify_euler_all_neg<X,Y,Z>(ea); - verify_euler_all_neg<X,Y,X>(ea); - verify_euler_all_neg<X,Z,Y>(ea); - verify_euler_all_neg<X,Z,X>(ea); + // Saturate the angles to the correct range + if (positiveRangeAlpha && (eabis2[0] < 0)) + eabis2[0] += Scalar(2 * EIGEN_PI); + if (positiveRangeBeta && (eabis2[1] < 0)) + eabis2[1] += Scalar(2 * EIGEN_PI); + if (positiveRangeGamma && (eabis2[2] < 0)) + eabis2[2] += Scalar(2 * EIGEN_PI); - verify_euler_all_neg<Y,Z,X>(ea); - verify_euler_all_neg<Y,Z,Y>(ea); - verify_euler_all_neg<Y,X,Z>(ea); - verify_euler_all_neg<Y,X,Y>(ea); + VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is - verify_euler_all_neg<Z,X,Y>(ea); - verify_euler_all_neg<Z,X,Z>(ea); - verify_euler_all_neg<Z,Y,X>(ea); - verify_euler_all_neg<Z,Y,Z>(ea); -} - -template<typename Scalar> void check_singular_cases(const Scalar& singularBeta) -{ - typedef Matrix<Scalar,3,1> Vector3; - const Scalar PI = Scalar(EIGEN_PI); + Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K)); + VERIFY_IS_APPROX(m, mbis); - for (Scalar epsilon = NumTraits<Scalar>::epsilon(); epsilon < 1; epsilon *= Scalar(1.2)) + // Tests that are only relevant for no possitive range + if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma)) { - check_all_var(Vector3(PI/4, singularBeta, PI/3)); - check_all_var(Vector3(PI/4, singularBeta - epsilon, PI/3)); - check_all_var(Vector3(PI/4, singularBeta - Scalar(1.5)*epsilon, PI/3)); - check_all_var(Vector3(PI/4, singularBeta - 2*epsilon, PI/3)); - check_all_var(Vector3(PI*Scalar(0.8), singularBeta - epsilon, Scalar(0.9)*PI)); - check_all_var(Vector3(PI*Scalar(-0.9), singularBeta + epsilon, PI*Scalar(0.3))); - check_all_var(Vector3(PI*Scalar(-0.6), singularBeta + Scalar(1.5)*epsilon, PI*Scalar(0.3))); - check_all_var(Vector3(PI*Scalar(-0.5), singularBeta + 2*epsilon, PI*Scalar(0.4))); - check_all_var(Vector3(PI*Scalar(0.9), singularBeta + epsilon, Scalar(0.8)*PI)); + /* If I==K, and ea[1]==0, then there no unique solution. */ + /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. */ + if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) + VERIFY((ea-eabis).norm() <= test_precision<Scalar>()); + + // approx_or_less_than does not work for 0 + VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1))); } - // This one for sanity, it had a problem with near pole cases in float scalar. 
- check_all_var(Vector3(PI*Scalar(0.8), singularBeta - Scalar(1E-6), Scalar(0.9)*PI)); + // Quaternions + QuaternionType q(e); + eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles(); + VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same } -template<typename Scalar> void eulerangles_manual() +template<typename EulerSystem, typename Scalar> +void verify_euler(const Matrix<Scalar,3,1>& ea) { - typedef Matrix<Scalar,3,1> Vector3; - const Vector3 Zero = Vector3::Zero(); - const Scalar PI = Scalar(EIGEN_PI); - - check_all_var(Zero); - - // singular cases - check_singular_cases(PI/2); - check_singular_cases(-PI/2); - - check_singular_cases(Scalar(0)); - check_singular_cases(Scalar(-0)); - - check_singular_cases(PI); - check_singular_cases(-PI); - - // non-singular cases - VectorXd alpha = VectorXd::LinSpaced(Eigen::Sequential, 20, Scalar(-0.99) * PI, PI); - VectorXd beta = VectorXd::LinSpaced(Eigen::Sequential, 20, Scalar(-0.49) * PI, Scalar(0.49) * PI); - VectorXd gamma = VectorXd::LinSpaced(Eigen::Sequential, 20, Scalar(-0.99) * PI, PI); - for (int i = 0; i < alpha.size(); ++i) { - for (int j = 0; j < beta.size(); ++j) { - for (int k = 0; k < gamma.size(); ++k) { - check_all_var(Vector3d(alpha(i), beta(j), gamma(k))); - } - } - } + verify_euler_ranged<EulerSystem>(ea, false, false, false); + verify_euler_ranged<EulerSystem>(ea, false, false, true); + verify_euler_ranged<EulerSystem>(ea, false, true, false); + verify_euler_ranged<EulerSystem>(ea, false, true, true); + verify_euler_ranged<EulerSystem>(ea, true, false, false); + verify_euler_ranged<EulerSystem>(ea, true, false, true); + verify_euler_ranged<EulerSystem>(ea, true, true, false); + verify_euler_ranged<EulerSystem>(ea, true, true, true); } -template<typename Scalar> void eulerangles_rand() +template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea) +{ + verify_euler<EulerSystemXYZ>(ea); + verify_euler<EulerSystemXYX>(ea); + verify_euler<EulerSystemXZY>(ea); + verify_euler<EulerSystemXZX>(ea); + + verify_euler<EulerSystemYZX>(ea); + verify_euler<EulerSystemYZY>(ea); + verify_euler<EulerSystemYXZ>(ea); + verify_euler<EulerSystemYXY>(ea); + + verify_euler<EulerSystemZXY>(ea); + verify_euler<EulerSystemZXZ>(ea); + verify_euler<EulerSystemZYX>(ea); + verify_euler<EulerSystemZYZ>(ea); +} + +template<typename Scalar> void eulerangles() { typedef Matrix<Scalar,3,3> Matrix3; typedef Matrix<Scalar,3,1> Vector3; @@ -274,19 +201,8 @@ template<typename Scalar> void eulerangles_rand() void test_EulerAngles() { - // Simple cast test - EulerAnglesXYZd onesEd(1, 1, 1); - EulerAnglesXYZf onesEf = onesEd.cast<float>(); - VERIFY_IS_APPROX(onesEd, onesEf.cast<double>()); - - CALL_SUBTEST_1( eulerangles_manual<float>() ); - CALL_SUBTEST_2( eulerangles_manual<double>() ); - for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST_3( eulerangles_rand<float>() ); - CALL_SUBTEST_4( eulerangles_rand<double>() ); + CALL_SUBTEST_1( eulerangles<float>() ); + CALL_SUBTEST_2( eulerangles<double>() ); } - - // TODO: Add tests for auto diff - // TODO: Add tests for complex numbers } diff --git a/eigen/unsupported/test/autodiff_scalar.cpp b/eigen/unsupported/test/autodiff_scalar.cpp index 4df2f5c..9cf1128 100644 --- a/eigen/unsupported/test/autodiff_scalar.cpp +++ b/eigen/unsupported/test/autodiff_scalar.cpp @@ -72,6 +72,20 @@ template<typename Scalar> void check_hyperbolic_functions() VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150)); } +template <typename Scalar> +void 
check_limits_specialization() +{ + typedef Eigen::Matrix<Scalar, 1, 1> Deriv; + typedef Eigen::AutoDiffScalar<Deriv> AD; + + typedef std::numeric_limits<AD> A; + typedef std::numeric_limits<Scalar> B; + +#if EIGEN_HAS_CXX11 + VERIFY(bool(std::is_base_of<B, A>::value)); +#endif +} + void test_autodiff_scalar() { for(int i = 0; i < g_repeat; i++) { @@ -79,5 +93,6 @@ void test_autodiff_scalar() CALL_SUBTEST_2( check_atan2<double>() ); CALL_SUBTEST_3( check_hyperbolic_functions<float>() ); CALL_SUBTEST_4( check_hyperbolic_functions<double>() ); + CALL_SUBTEST_5( check_limits_specialization<double>()); } } diff --git a/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp index 48cd2d4..5f9bb93 100644 --- a/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp +++ b/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp @@ -11,7 +11,6 @@ #define EIGEN_USE_THREADS #include "main.h" #include "Eigen/CXX11/ThreadPool" -#include "Eigen/CXX11/Tensor" static void test_create_destroy_empty_pool() { @@ -23,11 +22,11 @@ static void test_create_destroy_empty_pool() } -static void test_parallelism(bool allow_spinning) +static void test_parallelism() { // Test we never-ever fail to match available tasks with idle threads. const int kThreads = 16; // code below expects that this is a multiple of 4 - NonBlockingThreadPool tp(kThreads, allow_spinning); + NonBlockingThreadPool tp(kThreads); VERIFY_IS_EQUAL(tp.NumThreads(), kThreads); VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1); for (int iter = 0; iter < 100; ++iter) { @@ -101,25 +100,8 @@ static void test_parallelism(bool allow_spinning) } } - -static void test_cancel() -{ - NonBlockingThreadPool tp(2); - - // Schedule a large number of closure that each sleeps for one second. This - // will keep the thread pool busy for much longer than the default test timeout. - for (int i = 0; i < 1000; ++i) { - tp.Schedule([]() { EIGEN_SLEEP(2000); }); - } - - // Cancel the processing of all the closures that are still pending. 
- tp.Cancel(); -} - void test_cxx11_non_blocking_thread_pool() { CALL_SUBTEST(test_create_destroy_empty_pool()); - CALL_SUBTEST(test_parallelism(true)); - CALL_SUBTEST(test_parallelism(false)); - CALL_SUBTEST(test_cancel()); + CALL_SUBTEST(test_parallelism()); } diff --git a/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp index 21fdfca..7201bfe 100644 --- a/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp +++ b/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp @@ -14,7 +14,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_broadcast_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_SYCL #include "main.h" @@ -25,99 +25,39 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template <typename DataType, int DataLayout, typename IndexType> -static void test_broadcast_sycl_fixed(const Eigen::SyclDevice &sycl_device){ - - // BROADCAST test: - IndexType inDim1=2; - IndexType inDim2=3; - IndexType inDim3=5; - IndexType inDim4=7; - IndexType bDim1=2; - IndexType bDim2=3; - IndexType bDim3=1; - IndexType bDim4=4; - array<IndexType, 4> in_range = {{inDim1, inDim2, inDim3, inDim4}}; - array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}}; - array<IndexType, 4> out_range; // = in_range * broadcasts - for (size_t i = 0; i < out_range.size(); ++i) - out_range[i] = in_range[i] * broadcasts[i]; - - Tensor<DataType, 4, DataLayout, IndexType> input(in_range); - Tensor<DataType, 4, DataLayout, IndexType> out(out_range); - - for (size_t i = 0; i < in_range.size(); ++i) - VERIFY_IS_EQUAL(out.dimension(i), out_range[i]); - - - for (IndexType i = 0; i < input.size(); ++i) - input(i) = static_cast<DataType>(i); - - DataType * gpu_in_data = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType))); - DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); - - TensorMap<TensorFixedSize<DataType, Sizes<2, 3, 5, 7>, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range); - TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range); - sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType)); - gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts); - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); - - for (IndexType i = 0; i < inDim1*bDim1; ++i) { - for (IndexType j = 0; j < inDim2*bDim2; ++j) { - for (IndexType k = 0; k < inDim3*bDim3; ++k) { - for (IndexType l = 0; l < inDim4*bDim4; ++l) { - VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l)); - } - } - } - } - printf("Broadcast Test with fixed size Passed\n"); - sycl_device.deallocate(gpu_in_data); - sycl_device.deallocate(gpu_out_data); -} - -template <typename DataType, int DataLayout, typename IndexType> static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){ // BROADCAST test: - IndexType inDim1=2; - IndexType inDim2=3; - IndexType inDim3=5; - IndexType inDim4=7; - IndexType bDim1=2; - IndexType bDim2=3; - IndexType bDim3=1; - IndexType bDim4=4; - array<IndexType, 4> in_range = {{inDim1, inDim2, inDim3, inDim4}}; - array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}}; - array<IndexType, 4> out_range; // = in_range * broadcasts + array<int, 4> in_range = {{2, 3, 5, 7}}; + 
array<int, 4> broadcasts = {{2, 3, 1, 4}}; + array<int, 4> out_range; // = in_range * broadcasts for (size_t i = 0; i < out_range.size(); ++i) out_range[i] = in_range[i] * broadcasts[i]; - Tensor<DataType, 4, DataLayout, IndexType> input(in_range); - Tensor<DataType, 4, DataLayout, IndexType> out(out_range); + Tensor<float, 4> input(in_range); + Tensor<float, 4> out(out_range); for (size_t i = 0; i < in_range.size(); ++i) VERIFY_IS_EQUAL(out.dimension(i), out_range[i]); - for (IndexType i = 0; i < input.size(); ++i) - input(i) = static_cast<DataType>(i); + for (int i = 0; i < input.size(); ++i) + input(i) = static_cast<float>(i); - DataType * gpu_in_data = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType))); - DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + float * gpu_in_data = static_cast<float*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(float))); + float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float))); - TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range); - TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range); - sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType)); + TensorMap<Tensor<float, 4>> gpu_in(gpu_in_data, in_range); + TensorMap<Tensor<float, 4>> gpu_out(gpu_out_data, out_range); + sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(float)); gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts); - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (IndexType i = 0; i < inDim1*bDim1; ++i) { - for (IndexType j = 0; j < inDim2*bDim2; ++j) { - for (IndexType k = 0; k < inDim3*bDim3; ++k) { - for (IndexType l = 0; l < inDim4*bDim4; ++l) { - VERIFY_IS_APPROX(input(i%inDim1,j%inDim2,k%inDim3,l%inDim4), out(i,j,k,l)); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 9; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 28; ++l) { + VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l)); } } } @@ -127,18 +67,8 @@ static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){ sycl_device.deallocate(gpu_out_data); } -template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::device& d){ - std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; - QueueInterface queueInterface(d); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device); - test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device); - test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device); -} - void test_cxx11_tensor_broadcast_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device)); - } + cl::sycl::gpu_selector s; + Eigen::SyclDevice sycl_device(s); + CALL_SUBTEST(test_broadcast_sycl(sycl_device)); } diff --git a/eigen/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_builtins_sycl.cpp deleted file mode 100644 index 400a31d..0000000 --- 
a/eigen/unsupported/test/cxx11_tensor_builtins_sycl.cpp +++ /dev/null @@ -1,267 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_builtins_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::array; -using Eigen::SyclDevice; -using Eigen::Tensor; -using Eigen::TensorMap; - -namespace std { -template <typename T> T rsqrt(T x) { return 1 / std::sqrt(x); } -template <typename T> T square(T x) { return x * x; } -template <typename T> T cube(T x) { return x * x * x; } -template <typename T> T inverse(T x) { return 1 / x; } -} - -#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \ - { \ - /* out OPERATOR in.FUNC() */ \ - Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ - in = in.random() + static_cast<SCALAR>(0.01); \ - out = out.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ - SCALAR *gpu_data = static_cast<SCALAR *>( \ - sycl_device.allocate(in.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast<SCALAR *>( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ - (in.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ - (out.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver OPERATOR std::FUNC(in(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data); \ - sycl_device.deallocate(gpu_data_out); \ - } \ - { \ - /* out OPERATOR out.FUNC() */ \ - Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ - out = out.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ - SCALAR *gpu_data_out = static_cast<SCALAR *>( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ - (out.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver OPERATOR std::FUNC(reference(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data_out); \ - } - -#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , 
Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout) - -#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \ - { \ - /* out = in.FUNC() */ \ - Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \ - Tensor<bool, 3, Layout, int64_t> out(tensorRange); \ - in = in.random() + static_cast<SCALAR>(0.01); \ - SCALAR *gpu_data = static_cast<SCALAR *>( \ - sycl_device.allocate(in.size() * sizeof(SCALAR))); \ - bool *gpu_data_out = \ - static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \ - TensorMap<Tensor<bool, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ - (in.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(bool)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \ - } \ - sycl_device.deallocate(gpu_data); \ - sycl_device.deallocate(gpu_data_out); \ - } - -#define TEST_UNARY_BUILTINS(SCALAR, Layout) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout) - -static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) { - int64_t sizeDim1 = 10; - int64_t sizeDim2 = 10; - int64_t sizeDim3 = 10; - array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - - TEST_UNARY_BUILTINS(float, RowMajor) - TEST_UNARY_BUILTINS(float, ColMajor) -} - -namespace std { -template <typename T> T cwiseMax(T x, T y) { return std::max(x, y); } -template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); } -} - -#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \ - { \ - /* out = in_1.FUNC(in_2) */ \ - Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ - in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ - in_2 = in_2.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ - SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_2 = static_cast<SCALAR *>( \ - sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast<SCALAR *>( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ - 
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ - (in_2.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver = std::FUNC(in_1(i), in_2(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_2); \ - sycl_device.deallocate(gpu_data_out); \ - } - -#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \ - { \ - /* out = in_1 OPERATOR in_2 */ \ - Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ - in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ - in_2 = in_2.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ - SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_2 = static_cast<SCALAR *>( \ - sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast<SCALAR *>( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ - (in_2.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_2); \ - sycl_device.deallocate(gpu_data_out); \ - } - -#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \ - { \ - /* out = in_1 OPERATOR 2 */ \ - Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \ - Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \ - in_1 = in_1.random() + static_cast<SCALAR>(0.01); \ - Tensor<SCALAR, 3, Layout, int64_t> reference(out); \ - SCALAR *gpu_data_1 = static_cast<SCALAR *>( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast<SCALAR *>( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \ - TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_out); \ - } - 
-#define TEST_BINARY_BUILTINS(SCALAR, Layout) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout) - -static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { - int64_t sizeDim1 = 10; - int64_t sizeDim2 = 10; - int64_t sizeDim3 = 10; - array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_BINARY_BUILTINS(float, RowMajor) - TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor) - TEST_BINARY_BUILTINS(float, ColMajor) - TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor) -} - -void test_cxx11_tensor_builtins_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - QueueInterface queueInterface(device); - Eigen::SyclDevice sycl_device(&queueInterface); - CALL_SUBTEST(test_builtin_unary_sycl(sycl_device)); - CALL_SUBTEST(test_builtin_binary_sycl(sycl_device)); - } -} diff --git a/eigen/unsupported/test/cxx11_tensor_chipping.cpp b/eigen/unsupported/test/cxx11_tensor_chipping.cpp index 89cf5c7..1832dec 100644 --- a/eigen/unsupported/test/cxx11_tensor_chipping.cpp +++ b/eigen/unsupported/test/cxx11_tensor_chipping.cpp @@ -43,7 +43,7 @@ static void test_simple_chip() VERIFY_IS_EQUAL(chip2.dimension(2), 7); VERIFY_IS_EQUAL(chip2.dimension(3), 11); for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 5; ++j) { + for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); @@ -75,7 +75,7 @@ static void test_simple_chip() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 11; ++l) { + for (int l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); } } @@ -126,7 +126,7 @@ static void test_dynamic_chip() VERIFY_IS_EQUAL(chip2.dimension(2), 7); VERIFY_IS_EQUAL(chip2.dimension(3), 11); for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 5; ++j) { + for (int j = 0; j < 3; ++j) { for (int k = 0; k < 7; ++k) { for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); @@ -158,7 +158,7 @@ static void test_dynamic_chip() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 11; ++l) { + for (int l = 0; l < 7; ++l) { VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); } } diff --git a/eigen/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_chipping_sycl.cpp deleted file mode 100644 index 39e4f0a..0000000 --- a/eigen/unsupported/test/cxx11_tensor_chipping_sycl.cpp +++ /dev/null @@ -1,622 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_chipping_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - -#include "main.h" - -#include <Eigen/CXX11/Tensor> - -using Eigen::Tensor; - -template <typename DataType, int DataLayout, typename IndexType> -static void test_static_chip_sycl(const Eigen::SyclDevice& sycl_device) -{ - IndexType sizeDim1 = 2; - IndexType sizeDim2 = 3; - IndexType sizeDim3 = 5; - IndexType sizeDim4 = 7; - IndexType sizeDim5 = 11; - - array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - - Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); - Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); - - tensor.setRandom(); - - const size_t tensorBuffSize =tensor.size()*sizeof(DataType); - const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); - DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); - DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); - - TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); - - sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); - gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(1l); - sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); - - VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2); - VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3); - VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4); - VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5); - - for (IndexType i = 0; i < sizeDim2; ++i) { - for (IndexType j = 0; j < sizeDim3; ++j) { - for (IndexType k = 0; k < sizeDim4; ++k) { - for (IndexType l = 0; l < sizeDim5; ++l) { - VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l)); - } - } - } - } - - array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; - Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange); - const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType); - DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange); - - gpu_chip2.device(sycl_device)=gpu_tensor.template chip<1l>(1l); - sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); - - VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1); - VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3); - VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4); - VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim3; ++j) { - for (IndexType k = 0; k < sizeDim4; ++k) { - for (IndexType l = 0; l < sizeDim5; ++l) { - VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l)); - } - } - } - } - - array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; - Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange); - const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType); - DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange); - - 
gpu_chip3.device(sycl_device)=gpu_tensor.template chip<2l>(2l); - sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize); - - VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1); - VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2); - VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4); - VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim4; ++k) { - for (IndexType l = 0; l < sizeDim5; ++l) { - VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l)); - } - } - } - } - - array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; - Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange); - const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType); - DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange); - - gpu_chip4.device(sycl_device)=gpu_tensor.template chip<3l>(5l); - sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize); - - VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1); - VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2); - VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3); - VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { - for (IndexType l = 0; l < sizeDim5; ++l) { - VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l)); - } - } - } - } - - - array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange); - const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType); - DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange); - - gpu_chip5.device(sycl_device)=gpu_tensor.template chip<4l>(7l); - sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize); - - VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1); - VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2); - VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3); - VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { - for (IndexType l = 0; l < sizeDim4; ++l) { - VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l)); - } - } - } - } - - sycl_device.deallocate(gpu_data_tensor); - sycl_device.deallocate(gpu_data_chip1); - sycl_device.deallocate(gpu_data_chip2); - sycl_device.deallocate(gpu_data_chip3); - sycl_device.deallocate(gpu_data_chip4); - sycl_device.deallocate(gpu_data_chip5); -} - -template <typename DataType, int DataLayout, typename IndexType> -static void test_dynamic_chip_sycl(const Eigen::SyclDevice& sycl_device) -{ - IndexType sizeDim1 = 2; - IndexType sizeDim2 = 3; - IndexType sizeDim3 = 5; - IndexType sizeDim4 = 7; - IndexType sizeDim5 = 11; - - array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - - Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); - Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); - - tensor.setRandom(); - - const size_t tensorBuffSize 
=tensor.size()*sizeof(DataType); - const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); - DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); - DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); - - TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); - - sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); - gpu_chip1.device(sycl_device)=gpu_tensor.chip(1l,0l); - sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); - - VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2); - VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3); - VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4); - VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5); - - for (IndexType i = 0; i < sizeDim2; ++i) { - for (IndexType j = 0; j < sizeDim3; ++j) { - for (IndexType k = 0; k < sizeDim4; ++k) { - for (IndexType l = 0; l < sizeDim5; ++l) { - VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l)); - } - } - } - } - - array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; - Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange); - const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType); - DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange); - - gpu_chip2.device(sycl_device)=gpu_tensor.chip(1l,1l); - sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); - - VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1); - VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3); - VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4); - VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim3; ++j) { - for (IndexType k = 0; k < sizeDim4; ++k) { - for (IndexType l = 0; l < sizeDim5; ++l) { - VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l)); - } - } - } - } - - array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; - Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange); - const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType); - DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange); - - gpu_chip3.device(sycl_device)=gpu_tensor.chip(2l,2l); - sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize); - - VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1); - VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2); - VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4); - VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim4; ++k) { - for (IndexType l = 0; l < sizeDim5; ++l) { - VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l)); - } - } - } - } - - array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; - Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange); - const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType); - DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize)); - TensorMap<Tensor<DataType, 4, 
DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange); - - gpu_chip4.device(sycl_device)=gpu_tensor.chip(5l,3l); - sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize); - - VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1); - VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2); - VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3); - VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { - for (IndexType l = 0; l < sizeDim5; ++l) { - VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l)); - } - } - } - } - - - array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange); - const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType); - DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange); - - gpu_chip5.device(sycl_device)=gpu_tensor.chip(7l,4l); - sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize); - - VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1); - VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2); - VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3); - VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { - for (IndexType l = 0; l < sizeDim4; ++l) { - VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l)); - } - } - } - } - sycl_device.deallocate(gpu_data_tensor); - sycl_device.deallocate(gpu_data_chip1); - sycl_device.deallocate(gpu_data_chip2); - sycl_device.deallocate(gpu_data_chip3); - sycl_device.deallocate(gpu_data_chip4); - sycl_device.deallocate(gpu_data_chip5); -} - -template <typename DataType, int DataLayout, typename IndexType> -static void test_chip_in_expr(const Eigen::SyclDevice& sycl_device) { - - IndexType sizeDim1 = 2; - IndexType sizeDim2 = 3; - IndexType sizeDim3 = 5; - IndexType sizeDim4 = 7; - IndexType sizeDim5 = 11; - - array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - - Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); - - Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); - Tensor<DataType, 4, DataLayout,IndexType> tensor1(chip1TensorRange); - tensor.setRandom(); - tensor1.setRandom(); - - const size_t tensorBuffSize =tensor.size()*sizeof(DataType); - const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); - DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); - DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); - DataType* gpu_data_tensor1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); - - TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor1(gpu_data_tensor1, chip1TensorRange); - - - sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); - sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize); - 
gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(0l) + gpu_tensor1; - sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); - - for (int i = 0; i < sizeDim2; ++i) { - for (int j = 0; j < sizeDim3; ++j) { - for (int k = 0; k < sizeDim4; ++k) { - for (int l = 0; l < sizeDim5; ++l) { - float expected = tensor(0l,i,j,k,l) + tensor1(i,j,k,l); - VERIFY_IS_EQUAL(chip1(i,j,k,l), expected); - } - } - } - } - - array<IndexType, 3> chip2TensorRange = {{sizeDim2, sizeDim4, sizeDim5}}; - Tensor<DataType, 3, DataLayout,IndexType> tensor2(chip2TensorRange); - Tensor<DataType, 3, DataLayout,IndexType> chip2(chip2TensorRange); - tensor2.setRandom(); - const size_t chip2TensorBuffSize =tensor2.size()*sizeof(DataType); - DataType* gpu_data_tensor2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); - DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); - TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_tensor2(gpu_data_tensor2, chip2TensorRange); - TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange); - - sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize); - gpu_chip2.device(sycl_device)=gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2; - sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); - - for (int i = 0; i < sizeDim2; ++i) { - for (int j = 0; j < sizeDim4; ++j) { - for (int k = 0; k < sizeDim5; ++k) { - float expected = tensor(0l,i,2l,j,k) + tensor2(i,j,k); - VERIFY_IS_EQUAL(chip2(i,j,k), expected); - } - } - } - sycl_device.deallocate(gpu_data_tensor); - sycl_device.deallocate(gpu_data_tensor1); - sycl_device.deallocate(gpu_data_chip1); - sycl_device.deallocate(gpu_data_tensor2); - sycl_device.deallocate(gpu_data_chip2); -} - -template <typename DataType, int DataLayout, typename IndexType> -static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) -{ - - IndexType sizeDim1 = 2; - IndexType sizeDim2 = 3; - IndexType sizeDim3 = 5; - IndexType sizeDim4 = 7; - IndexType sizeDim5 = 11; - - array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - array<IndexType, 4> input2TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - - Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); - Tensor<DataType, 5, DataLayout,IndexType> input1(tensorRange); - Tensor<DataType, 4, DataLayout,IndexType> input2(input2TensorRange); - input1.setRandom(); - input2.setRandom(); - - - const size_t tensorBuffSize =tensor.size()*sizeof(DataType); - const size_t input2TensorBuffSize =input2.size()*sizeof(DataType); - DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); - DataType* gpu_data_input1 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); - DataType* gpu_data_input2 = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize)); - - TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); - TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input1(gpu_data_input1, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input2(gpu_data_input2, input2TensorRange); - - sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize); - gpu_tensor.device(sycl_device)=gpu_input1; - sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize); - gpu_tensor.template 
chip<0l>(1l).device(sycl_device)=gpu_input2; - sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); - - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { - for (int l = 0; l < sizeDim4; ++l) { - for (int m = 0; m < sizeDim5; ++m) { - if (i != 1) { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); - } else { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m)); - } - } - } - } - } - } - - gpu_tensor.device(sycl_device)=gpu_input1; - array<IndexType, 4> input3TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; - Tensor<DataType, 4, DataLayout,IndexType> input3(input3TensorRange); - input3.setRandom(); - - const size_t input3TensorBuffSize =input3.size()*sizeof(DataType); - DataType* gpu_data_input3 = static_cast<DataType*>(sycl_device.allocate(input3TensorBuffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input3(gpu_data_input3, input3TensorRange); - - sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize); - gpu_tensor.template chip<1l>(1l).device(sycl_device)=gpu_input3; - sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); - - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k <sizeDim3; ++k) { - for (int l = 0; l < sizeDim4; ++l) { - for (int m = 0; m < sizeDim5; ++m) { - if (j != 1) { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); - } else { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m)); - } - } - } - } - } - } - - gpu_tensor.device(sycl_device)=gpu_input1; - array<IndexType, 4> input4TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; - Tensor<DataType, 4, DataLayout,IndexType> input4(input4TensorRange); - input4.setRandom(); - - const size_t input4TensorBuffSize =input4.size()*sizeof(DataType); - DataType* gpu_data_input4 = static_cast<DataType*>(sycl_device.allocate(input4TensorBuffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input4(gpu_data_input4, input4TensorRange); - - sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize); - gpu_tensor.template chip<2l>(3l).device(sycl_device)=gpu_input4; - sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); - - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k <sizeDim3; ++k) { - for (int l = 0; l < sizeDim4; ++l) { - for (int m = 0; m < sizeDim5; ++m) { - if (k != 3) { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); - } else { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m)); - } - } - } - } - } - } - - gpu_tensor.device(sycl_device)=gpu_input1; - array<IndexType, 4> input5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; - Tensor<DataType, 4, DataLayout,IndexType> input5(input5TensorRange); - input5.setRandom(); - - const size_t input5TensorBuffSize =input5.size()*sizeof(DataType); - DataType* gpu_data_input5 = static_cast<DataType*>(sycl_device.allocate(input5TensorBuffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input5(gpu_data_input5, input5TensorRange); - - sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize); - gpu_tensor.template chip<3l>(4l).device(sycl_device)=gpu_input5; - sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); - - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k <sizeDim3; ++k) { - for (int l = 0; l 
< sizeDim4; ++l) { - for (int m = 0; m < sizeDim5; ++m) { - if (l != 4) { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); - } else { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m)); - } - } - } - } - } - } - gpu_tensor.device(sycl_device)=gpu_input1; - array<IndexType, 4> input6TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - Tensor<DataType, 4, DataLayout,IndexType> input6(input6TensorRange); - input6.setRandom(); - - const size_t input6TensorBuffSize =input6.size()*sizeof(DataType); - DataType* gpu_data_input6 = static_cast<DataType*>(sycl_device.allocate(input6TensorBuffSize)); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input6(gpu_data_input6, input6TensorRange); - - sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize); - gpu_tensor.template chip<4l>(5l).device(sycl_device)=gpu_input6; - sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); - - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k <sizeDim3; ++k) { - for (int l = 0; l < sizeDim4; ++l) { - for (int m = 0; m < sizeDim5; ++m) { - if (m != 5) { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); - } else { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l)); - } - } - } - } - } - } - - - gpu_tensor.device(sycl_device)=gpu_input1; - Tensor<DataType, 5, DataLayout,IndexType> input7(tensorRange); - input7.setRandom(); - - DataType* gpu_data_input7 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); - TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input7(gpu_data_input7, tensorRange); - - sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize); - gpu_tensor.chip(0l,0l).device(sycl_device)=gpu_input7.chip(0l,0l); - sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); - - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k <sizeDim3; ++k) { - for (int l = 0; l < sizeDim4; ++l) { - for (int m = 0; m < sizeDim5; ++m) { - if (i != 0) { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); - } else { - VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m)); - } - } - } - } - } - } - sycl_device.deallocate(gpu_data_tensor); - sycl_device.deallocate(gpu_data_input1); - sycl_device.deallocate(gpu_data_input2); - sycl_device.deallocate(gpu_data_input3); - sycl_device.deallocate(gpu_data_input4); - sycl_device.deallocate(gpu_data_input5); - sycl_device.deallocate(gpu_data_input6); - sycl_device.deallocate(gpu_data_input7); - -} - -template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){ - QueueInterface queueInterface(s); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device); - test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device); - test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device); - test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device); - test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device); -} -void test_cxx11_tensor_chipping_sycl() -{ - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_chipping_test_per_device<float>(device)); - } -} diff --git 
a/eigen/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_concatenation_sycl.cpp deleted file mode 100644 index e3023a3..0000000 --- a/eigen/unsupported/test/cxx11_tensor_concatenation_sycl.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::Tensor; - -template<typename DataType, int DataLayout, typename IndexType> -static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) -{ - IndexType leftDim1 = 2; - IndexType leftDim2 = 3; - IndexType leftDim3 = 1; - Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}}; - IndexType rightDim1 = 2; - IndexType rightDim2 = 3; - IndexType rightDim3 = 1; - Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}}; - - //IndexType concatDim1 = 3; -// IndexType concatDim2 = 3; -// IndexType concatDim3 = 1; - //Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}}; - - Tensor<DataType, 3, DataLayout, IndexType> left(leftRange); - Tensor<DataType, 3, DataLayout, IndexType> right(rightRange); - left.setRandom(); - right.setRandom(); - - DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); - DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); - - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange); - sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); - /// - Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3); - DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions()); - - //concatenation = left.concatenate(right, 0); - gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0); - sycl_device.memcpyDeviceToHost(concatenation1.data(), gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType)); - - VERIFY_IS_EQUAL(concatenation1.dimension(0), 4); - VERIFY_IS_EQUAL(concatenation1.dimension(1), 3); - VERIFY_IS_EQUAL(concatenation1.dimension(2), 1); - for (IndexType j = 0; j < 3; ++j) { - for (IndexType i = 0; i < 2; ++i) { - VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0)); - } - for (IndexType i = 2; i < 4; ++i) { - VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0)); - } - } - - 
sycl_device.deallocate(gpu_out_data1); - Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3); - DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions()); - gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1); - sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType)); - - //concatenation = left.concatenate(right, 1); - VERIFY_IS_EQUAL(concatenation2.dimension(0), 2); - VERIFY_IS_EQUAL(concatenation2.dimension(1), 6); - VERIFY_IS_EQUAL(concatenation2.dimension(2), 1); - for (IndexType i = 0; i < 2; ++i) { - for (IndexType j = 0; j < 3; ++j) { - VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0)); - } - for (IndexType j = 3; j < 6; ++j) { - VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0)); - } - } - sycl_device.deallocate(gpu_out_data2); - Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3); - DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType))); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions()); - gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2); - sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType)); - - //concatenation = left.concatenate(right, 2); - VERIFY_IS_EQUAL(concatenation3.dimension(0), 2); - VERIFY_IS_EQUAL(concatenation3.dimension(1), 3); - VERIFY_IS_EQUAL(concatenation3.dimension(2), 2); - for (IndexType i = 0; i < 2; ++i) { - for (IndexType j = 0; j < 3; ++j) { - VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0)); - VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0)); - } - } - sycl_device.deallocate(gpu_out_data3); - sycl_device.deallocate(gpu_in1_data); - sycl_device.deallocate(gpu_in2_data); -} -template<typename DataType, int DataLayout, typename IndexType> -static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) -{ - - IndexType leftDim1 = 2; - IndexType leftDim2 = 3; - Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}}; - - IndexType rightDim1 = 2; - IndexType rightDim2 = 3; - Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}}; - - IndexType concatDim1 = 4; - IndexType concatDim2 = 3; - Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}}; - - Tensor<DataType, 2, DataLayout, IndexType> left(leftRange); - Tensor<DataType, 2, DataLayout, IndexType> right(rightRange); - Tensor<DataType, 2, DataLayout, IndexType> result(resRange); - - left.setRandom(); - right.setRandom(); - result.setRandom(); - - DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); - DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); - DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); - - - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange); - 
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange); - - sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_out_data, result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); - -// t1.concatenate(t2, 0) = result; - gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) =gpu_out; - sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType)); - sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType)); - - for (IndexType i = 0; i < 2; ++i) { - for (IndexType j = 0; j < 3; ++j) { - VERIFY_IS_EQUAL(left(i, j), result(i, j)); - VERIFY_IS_EQUAL(right(i, j), result(i+2, j)); - } - } - sycl_device.deallocate(gpu_in1_data); - sycl_device.deallocate(gpu_in2_data); - sycl_device.deallocate(gpu_out_data); -} - - -template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){ - QueueInterface queueInterface(s); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device); - test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device); - test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device); -} -void test_cxx11_tensor_concatenation_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(tensorConcat_perDevice<float>(device)); - } -} diff --git a/eigen/unsupported/test/cxx11_tensor_contract_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_contract_sycl.cpp deleted file mode 100644 index 5bace66..0000000 --- a/eigen/unsupported/test/cxx11_tensor_contract_sycl.cpp +++ /dev/null @@ -1,290 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - -#include <iostream> -#include <chrono> -#include <ctime> - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::array; -using Eigen::SyclDevice; -using Eigen::Tensor; -using Eigen::TensorMap; -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) -{ - typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; -// std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; - // with these dimensions, the output has 300 * 140 elements, which is - // more than 30 * 1024, which is the number of threads in blocks on - // a 15 SM GK110 GPU - Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); - Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); - Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size); - Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size); -// Eigen::array<DimPair, 1> dims(DimPair(1, 0)); - Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; - Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; - Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; - Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}}; - - t_left.setRandom(); - t_right.setRandom(); - - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); - std::size_t t_right_bytes = t_right.size() * sizeof(DataType); - std::size_t t_result_bytes = t_result.size() * sizeof(DataType); - - DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, result_dims); - - sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); - sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); - - gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); - - t_result = t_left.contract(t_right, dims); - - for (IndexType i = 0; i < t_result.size(); i++) { - if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { - continue; - } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { - continue; - } - std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) - << " vs " << t_result_gpu(i) << std::endl; - assert(false); - } - sycl_device.deallocate(d_t_left); - sycl_device.deallocate(d_t_right); - sycl_device.deallocate(d_t_result); -} - -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_TF(const Device& sycl_device) -{ - typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; - 
Eigen::array<IndexType, 2> left_dims = {{2, 3}}; - Eigen::array<IndexType, 2> right_dims = {{3, 1}}; - Eigen::array<IndexType, 2> res_dims = {{2, 1}}; - Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; - - - Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); - Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); - Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); - Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); - - t_left.data()[0] = 1.0f; - t_left.data()[1] = 2.0f; - t_left.data()[2] = 3.0f; - t_left.data()[3] = 4.0f; - t_left.data()[4] = 5.0f; - t_left.data()[5] = 6.0f; - - t_right.data()[0] = -1.0f; - t_right.data()[1] = 0.5f; - t_right.data()[2] = 2.0f; - - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); - std::size_t t_right_bytes = t_right.size() * sizeof(DataType); - std::size_t t_result_bytes = t_result.size()*sizeof(DataType); - - - DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, res_dims); - - sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); - sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); - - gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); - - t_result = t_left.contract(t_right, dims); - - for (IndexType i = 0; i < t_result.size(); i++) { - if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { - continue; - } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { - continue; - } - std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) - << " vs " << t_result_gpu(i) << std::endl; - assert(false); - } - sycl_device.deallocate(d_t_left); - sycl_device.deallocate(d_t_right); - sycl_device.deallocate(d_t_result); - - -} - -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) -{ - //std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; - // with these dimensions, the output has 300 * 140 elements, which is - // more than 30 * 1024, which is the number of threads in blocks on - // a 15 SM GK110 GPU - typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; - Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); - Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); - Tensor<DataType, 0, DataLayout, IndexType> t_result; - Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu; - Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}}; - Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; - Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; - t_left.setRandom(); - t_right.setRandom(); - - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); - std::size_t t_right_bytes = t_right.size() 
* sizeof(DataType); - std::size_t t_result_bytes = sizeof(DataType); - - - DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType> > gpu_t_result(d_t_result); - - sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); - sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); - - gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); - - t_result = t_left.contract(t_right, dims); - - if (static_cast<DataType>(fabs(t_result() - t_result_gpu())) > error_threshold && - !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) { - std::cout << "mismatch detected: " << t_result() - << " vs " << t_result_gpu() << std::endl; - assert(false); - } - - sycl_device.deallocate(d_t_left); - sycl_device.deallocate(d_t_right); - sycl_device.deallocate(d_t_result); -} - - -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_sycl_contraction_m(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, 128); - } -} - -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_sycl_contraction_k(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, 128); - } -} - -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_sycl_contraction_n(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, 128, k); - } -} - - -template<int DataLayout, typename DataType, typename IndexType, typename Device> -void test_sycl_contraction_sizes(const Device& sycl_device) { - IndexType m_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257 , 511, - 512, 513, 1023, 1024, 1025}; - - IndexType n_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; - - IndexType k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; - - for (IndexType i = 0; i < 15; i++) { - for (IndexType j = 0; j < 15; j++) { - for (IndexType k = 0; k < 17; k++) { - test_sycl_contraction<DataLayout, DataType,IndexType>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); - } - } - } -} - -template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s){ - QueueInterface queueInterface(s); - auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_sycl_contraction<ColMajor, float,int64_t>(sycl_device, 32, 32, 32); - test_sycl_contraction<RowMajor,float,int64_t>(sycl_device, 32, 32, 32); - test_scalar<ColMajor,float,int64_t>(sycl_device, 32, 32, 32); - test_scalar<RowMajor,float,int64_t>(sycl_device, 32, 32, 32); - std::chrono::time_point<std::chrono::system_clock> start, end; - start = std::chrono::system_clock::now(); - 
test_sycl_contraction<ColMajor,float,int64_t>(sycl_device, 128, 128, 128); - test_sycl_contraction<RowMajor,float,int64_t>(sycl_device, 128, 128, 128); - test_scalar<ColMajor,float,int64_t>(sycl_device, 128, 128, 128); - test_scalar<RowMajor,float,int64_t>(sycl_device, 128, 128, 128); - test_sycl_contraction_m<ColMajor, float, int64_t>(sycl_device); - test_sycl_contraction_m<RowMajor, float, int64_t>(sycl_device); - test_sycl_contraction_n<ColMajor, float, int64_t>(sycl_device); - test_sycl_contraction_n<RowMajor, float, int64_t>(sycl_device); - test_sycl_contraction_k<ColMajor, float, int64_t>(sycl_device); - test_sycl_contraction_k<RowMajor, float, int64_t>(sycl_device); - test_sycl_contraction_sizes<ColMajor, float, int64_t>(sycl_device); - test_sycl_contraction_sizes<RowMajor, float, int64_t>(sycl_device); - test_TF<RowMajor, float, int64_t>(sycl_device); - test_TF<ColMajor, float, int64_t>(sycl_device); - - end = std::chrono::system_clock::now(); - std::chrono::duration<double> elapsed_seconds = end-start; - std::time_t end_time = std::chrono::system_clock::to_time_t(end); - std::cout << "finished computation at " << std::ctime(&end_time) - << "elapsed time: " << elapsed_seconds.count() << "s\n"; - -} - -void test_cxx11_tensor_contract_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(tensorContractionPerDevice(device)); - } -} diff --git a/eigen/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_convolution_sycl.cpp deleted file mode 100644 index a4226a6..0000000 --- a/eigen/unsupported/test/cxx11_tensor_convolution_sycl.cpp +++ /dev/null @@ -1,469 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - -#include <iostream> -#include <chrono> -#include <ctime> - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> -#include <iomanip> - -using Eigen::array; -using Eigen::SyclDevice; -using Eigen::Tensor; -using Eigen::TensorMap; -static const float error_threshold =1e-4f; - - -template <typename DataType, int DataLayout, typename IndexType> -static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device) -{ - IndexType indim0 =53; - IndexType indim1= 55; - IndexType indim2= 51; - IndexType outdim0=50; - IndexType outdim1=55; - IndexType outdim2=51; - Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; - Eigen::array<IndexType, 1> kernel_dims = {{4}}; - Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; - - Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); - Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims); - Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); - Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); - - Eigen::array<IndexType, 1> dims3{{0}}; - - input.setRandom(); - kernel.setRandom(); - result.setZero(); - result_host.setZero(); - - std::size_t input_bytes = input.size() * sizeof(DataType); - std::size_t kernel_bytes = kernel.size() * sizeof(DataType); - std::size_t result_bytes = result.size() * sizeof(DataType); - - DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); - DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); - DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims); - sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); - sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); - - gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); - sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); - - result_host=input.convolve(kernel, dims3); - -for(IndexType i=0; i< outdim0; i++ ){ - for(IndexType j=0; j< outdim1; j++ ){ - for(IndexType k=0; k< outdim2; k++ ){ - if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { - std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; - assert(false); - } - } - } -} - sycl_device.deallocate(d_input); - sycl_device.deallocate(d_kernel); - sycl_device.deallocate(d_result); - -} - - -template <typename DataType, int DataLayout, typename IndexType> -static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device) -{ - IndexType indim0 =53; - IndexType indim1= 55; - IndexType indim2= 51; - IndexType outdim0=50; - IndexType outdim1=51; - IndexType outdim2=51; - Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; - Eigen::array<IndexType, 2> kernel_dims = {{4,5}}; - Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; - - Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); - Tensor<DataType, 
2, DataLayout,IndexType> kernel(kernel_dims); - Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); - Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); - - Eigen::array<IndexType, 2> dims3{{0,1}}; - - input.setRandom(); - kernel.setRandom(); - result.setZero(); - result_host.setZero(); - - std::size_t input_bytes = input.size() * sizeof(DataType); - std::size_t kernel_bytes = kernel.size() * sizeof(DataType); - std::size_t result_bytes = result.size() * sizeof(DataType); - - DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); - DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); - DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims); - sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); - sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); - - gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); - sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); - - result_host=input.convolve(kernel, dims3); - -for(IndexType i=0; i< outdim0; i++ ){ - for(IndexType j=0; j< outdim1; j++ ){ - for(IndexType k=0; k< outdim2; k++ ){ - if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { - std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; - assert(false); - } - } - } -} - sycl_device.deallocate(d_input); - sycl_device.deallocate(d_kernel); - sycl_device.deallocate(d_result); - -} - - -template <typename DataType, int DataLayout, typename IndexType> -static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device) -{ - IndexType indim0 =53; - IndexType indim1= 55; - IndexType indim2= 51; - IndexType outdim0=50; - IndexType outdim1=51; - IndexType outdim2=49; - Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; - Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}}; - Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; - - Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); - Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims); - Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); - Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); - - Eigen::array<IndexType, 3> dims3{{0,1,2}}; - - input.setRandom(); - kernel.setRandom(); - result.setZero(); - result_host.setZero(); - - std::size_t input_bytes = input.size() * sizeof(DataType); - std::size_t kernel_bytes = kernel.size() * sizeof(DataType); - std::size_t result_bytes = result.size() * sizeof(DataType); - - DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); - DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); - DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> 
> gpu_result(d_result, result_dims); - sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); - sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); - - gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); - sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); - - result_host=input.convolve(kernel, dims3); - -for(IndexType i=0; i< outdim0; i++ ){ - for(IndexType j=0; j< outdim1; j++ ){ - for(IndexType k=0; k< outdim2; k++ ){ - if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { - std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; - assert(false); - } - } - } -} - sycl_device.deallocate(d_input); - sycl_device.deallocate(d_kernel); - sycl_device.deallocate(d_result); - -} - - -template <typename DataType, int DataLayout, typename IndexType> -static void test_evals(const Eigen::SyclDevice& sycl_device) -{ - Eigen::array<IndexType, 2> input_dims = {{3, 3}}; - Eigen::array<IndexType, 1> kernel_dims = {{2}}; - Eigen::array<IndexType, 2> result_dims = {{2, 3}}; - - Tensor<DataType, 2, DataLayout, IndexType> input(input_dims); - Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims); - Tensor<DataType, 2, DataLayout,IndexType> result(result_dims); - - Eigen::array<IndexType, 1> dims3{{0}}; - - input.setRandom(); - kernel.setRandom(); - result.setZero(); - - std::size_t input_bytes = input.size() * sizeof(DataType); - std::size_t kernel_bytes = kernel.size() * sizeof(DataType); - std::size_t result_bytes = result.size() * sizeof(DataType); - - DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); - DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); - DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims); - sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); - sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); - - gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); - sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); - - VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0 - VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2 - VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4 - VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1 - VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3 - VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 - - sycl_device.deallocate(d_input); - sycl_device.deallocate(d_kernel); - sycl_device.deallocate(d_result); -} - -template <typename DataType, int DataLayout, typename IndexType> -static void test_expr(const Eigen::SyclDevice& sycl_device) -{ - Eigen::array<IndexType, 2> input_dims = {{3, 3}}; - Eigen::array<IndexType, 2> kernel_dims = {{2, 2}}; - Eigen::array<IndexType, 2> result_dims = {{2, 2}}; - - Tensor<DataType, 2, DataLayout, IndexType> 
input(input_dims); - Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims); - Tensor<DataType, 2, DataLayout, IndexType> result(result_dims); - - input.setRandom(); - kernel.setRandom(); - Eigen::array<IndexType, 2> dims; - dims[0] = 0; - dims[1] = 1; - - std::size_t input_bytes = input.size() * sizeof(DataType); - std::size_t kernel_bytes = kernel.size() * sizeof(DataType); - std::size_t result_bytes = result.size() * sizeof(DataType); - - DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); - DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); - DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims); - sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); - sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); - - gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims); - sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); - - VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) + - input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1)); - VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) + - input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1)); - VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) + - input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1)); - VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) + - input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); - - sycl_device.deallocate(d_input); - sycl_device.deallocate(d_kernel); - sycl_device.deallocate(d_result); -} - - -template <typename DataType, int DataLayout, typename IndexType> -static void test_modes(const Eigen::SyclDevice& sycl_device){ - -Eigen::array<IndexType, 1> input_dims = {{3}}; -Eigen::array<IndexType, 1> kernel_dims = {{3}}; - -Tensor<DataType, 1, DataLayout, IndexType> input(input_dims); -Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims); - -input.setRandom(); -kernel.setRandom(); -Eigen::array<IndexType, 1> dims; -dims[0] = 0; - - input(0) = 1.0f; - input(1) = 2.0f; - input(2) = 3.0f; - kernel(0) = 0.5f; - kernel(1) = 1.0f; - kernel(2) = 0.0f; - - Eigen::array<std::pair<IndexType, IndexType>, 1> padding; - - // Emulate VALID mode (as defined in - // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
- padding[0] = std::make_pair(0, 0); - Tensor<DataType, 1, DataLayout, IndexType> valid(1); - - std::size_t input_bytes = input.size() * sizeof(DataType); - std::size_t kernel_bytes = kernel.size() * sizeof(DataType); - std::size_t valid_bytes = valid.size() * sizeof(DataType); - - DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); - DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); - DataType * d_valid = static_cast<DataType*>(sycl_device.allocate(valid_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions()); - sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); - sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); - - gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); - sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes); - - VERIFY_IS_EQUAL(valid.dimension(0), 1); - VERIFY_IS_APPROX(valid(0), 2.5f); - - // Emulate SAME mode (as defined in - // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). - padding[0] = std::make_pair(1, 1); - Tensor<DataType, 1, DataLayout, IndexType> same(3); - std::size_t same_bytes = same.size() * sizeof(DataType); - DataType * d_same = static_cast<DataType*>(sycl_device.allocate(same_bytes)); - Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions()); - gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); - sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes); - - VERIFY_IS_EQUAL(same.dimension(0), 3); - VERIFY_IS_APPROX(same(0), 1.0f); - VERIFY_IS_APPROX(same(1), 2.5f); - VERIFY_IS_APPROX(same(2), 4.0f); - - // Emulate FULL mode (as defined in - // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
- padding[0] = std::make_pair(2, 2); - - Tensor<DataType, 1, DataLayout, IndexType> full(5); - std::size_t full_bytes = full.size() * sizeof(DataType); - DataType * d_full = static_cast<DataType*>(sycl_device.allocate(full_bytes)); - Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions()); - gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); - sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes); - - VERIFY_IS_EQUAL(full.dimension(0), 5); - VERIFY_IS_APPROX(full(0), 0.0f); - VERIFY_IS_APPROX(full(1), 1.0f); - VERIFY_IS_APPROX(full(2), 2.5f); - VERIFY_IS_APPROX(full(3), 4.0f); - VERIFY_IS_APPROX(full(4), 1.5f); - - sycl_device.deallocate(d_input); - sycl_device.deallocate(d_kernel); - sycl_device.deallocate(d_valid); - sycl_device.deallocate(d_same); - sycl_device.deallocate(d_full); - -} - -template <typename DataType, int DataLayout, typename IndexType> -static void test_strides(const Eigen::SyclDevice& sycl_device){ - - Eigen::array<IndexType, 1> input_dims = {{13}}; - Eigen::array<IndexType, 1> kernel_dims = {{3}}; - - Tensor<DataType, 1, DataLayout, IndexType> input(input_dims); - Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims); - Tensor<DataType, 1, DataLayout, IndexType> result(2); - - input.setRandom(); - kernel.setRandom(); - Eigen::array<IndexType, 1> dims; - dims[0] = 0; - - Eigen::array<IndexType, 1> stride_of_3; - stride_of_3[0] = 3; - Eigen::array<IndexType, 1> stride_of_2; - stride_of_2[0] = 2; - - std::size_t input_bytes = input.size() * sizeof(DataType); - std::size_t kernel_bytes = kernel.size() * sizeof(DataType); - std::size_t result_bytes = result.size() * sizeof(DataType); - - DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); - DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); - DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions()); - sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); - sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); - - gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2); - sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); - - VERIFY_IS_EQUAL(result.dimension(0), 2); - VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + - input(6)*kernel(2))); - VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + - input(12)*kernel(2))); -} - -template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){ - QueueInterface queueInterface(s); - auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_larg_expr1D<float, RowMajor, int64_t>(sycl_device); - test_larg_expr1D<float, ColMajor, int64_t>(sycl_device); - test_larg_expr2D<float, RowMajor, int64_t>(sycl_device); - test_larg_expr2D<float, ColMajor, int64_t>(sycl_device); - test_larg_expr3D<float, RowMajor, int64_t>(sycl_device); - test_larg_expr3D<float, ColMajor, int64_t>(sycl_device); - test_evals<float, ColMajor, int64_t>(sycl_device); - test_evals<float, RowMajor, int64_t>(sycl_device); - test_expr<float, ColMajor, 
int64_t>(sycl_device); - test_expr<float, RowMajor, int64_t>(sycl_device); - test_modes<float, ColMajor, int64_t>(sycl_device); - test_modes<float, RowMajor, int64_t>(sycl_device); - test_strides<float, ColMajor, int64_t>(sycl_device); - test_strides<float, RowMajor, int64_t>(sycl_device); -} - -void test_cxx11_tensor_convolution_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(tensorConvolutionPerDevice(device)); - } -} diff --git a/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp index 3ecc68d..7f79753 100644 --- a/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp +++ b/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp @@ -14,64 +14,18 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_device_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> -#include <stdint.h> -#include <iostream> -template <typename DataType, int DataLayout, typename IndexType> -void test_device_memory(const Eigen::SyclDevice &sycl_device) { - std::cout << "Running on : " - << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>() - <<std::endl; - IndexType sizeDim1 = 100; - array<IndexType, 1> tensorRange = {{sizeDim1}}; - Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange); - Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange); - memset(in1.data(), 1, in1.size() * sizeof(DataType)); - DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType))); - sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType)); - sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType)); - for (IndexType i=0; i<in.size(); i++) { - VERIFY_IS_EQUAL(in(i), in1(i)); - } - sycl_device.deallocate(gpu_in_data); +void test_device_sycl(const Eigen::SyclDevice &sycl_device) { + std::cout <<"Helo from ComputeCpp: the requested device exists and the device name is : " + << sycl_device.m_queue.get_device(). template get_info<cl::sycl::info::device::name>() <<std::endl;; } - -template <typename DataType, int DataLayout, typename IndexType> -void test_device_exceptions(const Eigen::SyclDevice &sycl_device) { - VERIFY(sycl_device.ok()); - IndexType sizeDim1 = 100; - array<IndexType, 1> tensorDims = {{sizeDim1}}; - DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType))); - sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType)); - - TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims); - TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims); - out.device(sycl_device) = in / in.constant(0); - - sycl_device.synchronize(); - VERIFY(!sycl_device.ok()); - sycl_device.deallocate(gpu_data); -} - -template<typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){ - std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; - QueueInterface queueInterface(d); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_device_memory<DataType, RowMajor, int64_t>(sycl_device); - test_device_memory<DataType, ColMajor, int64_t>(sycl_device); - /// this test throw an exception. enable it if you want to see the exception - //test_device_exceptions<DataType, RowMajor>(sycl_device); - /// this test throw an exception. 
enable it if you want to see the exception - //test_device_exceptions<DataType, ColMajor>(sycl_device); -} - void test_cxx11_tensor_device_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_device_test_per_device<float>(device)); - } + cl::sycl::gpu_selector s; + Eigen::SyclDevice sycl_device(s); + CALL_SUBTEST(test_device_sycl(sycl_device)); } diff --git a/eigen/unsupported/test/cxx11_tensor_expr.cpp b/eigen/unsupported/test/cxx11_tensor_expr.cpp index 129b4e6..77e24cb 100644 --- a/eigen/unsupported/test/cxx11_tensor_expr.cpp +++ b/eigen/unsupported/test/cxx11_tensor_expr.cpp @@ -300,51 +300,6 @@ static void test_select() } } -template <typename Scalar> -void test_minmax_nan_propagation_templ() { - for (int size = 1; size < 17; ++size) { - const Scalar kNan = std::numeric_limits<Scalar>::quiet_NaN(); - Tensor<Scalar, 1> vec_nan(size); - Tensor<Scalar, 1> vec_zero(size); - Tensor<Scalar, 1> vec_res(size); - vec_nan.setConstant(kNan); - vec_zero.setZero(); - vec_res.setZero(); - - // Test that we propagate NaNs in the tensor when applying the - // cwiseMax(scalar) operator, which is used for the Relu operator. - vec_res = vec_nan.cwiseMax(Scalar(0)); - for (int i = 0; i < size; ++i) { - VERIFY((numext::isnan)(vec_res(i))); - } - - // Test that NaNs do not propagate if we reverse the arguments. - vec_res = vec_zero.cwiseMax(kNan); - for (int i = 0; i < size; ++i) { - VERIFY_IS_EQUAL(vec_res(i), Scalar(0)); - } - - // Test that we propagate NaNs in the tensor when applying the - // cwiseMin(scalar) operator. - vec_res.setZero(); - vec_res = vec_nan.cwiseMin(Scalar(0)); - for (int i = 0; i < size; ++i) { - VERIFY((numext::isnan)(vec_res(i))); - } - - // Test that NaNs do not propagate if we reverse the arguments. 
- vec_res = vec_zero.cwiseMin(kNan); - for (int i = 0; i < size; ++i) { - VERIFY_IS_EQUAL(vec_res(i), Scalar(0)); - } - } -} - -static void test_minmax_nan_propagation() -{ - test_minmax_nan_propagation_templ<float>(); - test_minmax_nan_propagation_templ<double>(); -} void test_cxx11_tensor_expr() { @@ -356,5 +311,4 @@ void test_cxx11_tensor_expr() CALL_SUBTEST(test_functors()); CALL_SUBTEST(test_type_casting()); CALL_SUBTEST(test_select()); - CALL_SUBTEST(test_minmax_nan_propagation()); } diff --git a/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp b/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp index e6274f8..4c660de 100644 --- a/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp +++ b/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp @@ -21,7 +21,7 @@ static void test_0d() TensorFixedSize<float, Sizes<>, RowMajor> scalar2; VERIFY_IS_EQUAL(scalar1.rank(), 0); VERIFY_IS_EQUAL(scalar1.size(), 1); - VERIFY_IS_EQUAL(internal::array_prod(scalar1.dimensions()), 1); + VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1); scalar1() = 7.0; scalar2() = 13.0; diff --git a/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index aca036c..5690da7 100644 --- a/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -14,43 +14,43 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> using Eigen::Tensor; -template <typename DataType, int DataLayout, typename IndexType> + void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { - IndexType sizeDim1 = 100; - IndexType sizeDim2 = 20; - IndexType sizeDim3 = 20; - Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange); - Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange); - Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange); + int sizeDim1 = 100; + int sizeDim2 = 200; + int sizeDim3 = 200; + Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Eigen::Tensor<float, 3> in1(tensorRange); + Eigen::Tensor<float, 3> in2(tensorRange); + Eigen::Tensor<float, 3> out(tensorRange); - DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); - DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); - DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + float * gpu_in1_data = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float))); + float * gpu_in2_data = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float))); + float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float))); in1 = in1.random() + in1.constant(10.0f); in2 = in2.random() + in2.constant(10.0f); // creating TensorMap from tensor - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); - Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, 
IndexType>> gpu_out(gpu_out_data, tensorRange); - sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange); + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(float)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(float)); /// c=(a+b)*b gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) + in2(i, j, k)) * in2(i, j, k)); } @@ -63,14 +63,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { } -template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){ - QueueInterface queueInterface(s); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device); -} void test_cxx11_tensor_forced_eval_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(tensorForced_evalperDevice<float>(device)); - } + cl::sycl::gpu_selector s; + Eigen::SyclDevice sycl_device(s); + CALL_SUBTEST(test_forced_eval_sycl(sycl_device)); } diff --git a/eigen/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_morphing_sycl.cpp deleted file mode 100644 index 9b521bc..0000000 --- a/eigen/unsupported/test/cxx11_tensor_morphing_sycl.cpp +++ /dev/null @@ -1,248 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_morphing_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::array; -using Eigen::SyclDevice; -using Eigen::Tensor; -using Eigen::TensorMap; - -template <typename DataType, int DataLayout, typename IndexType> -static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) -{ - typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1); - typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7); - typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7); - typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21); - - Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1); - Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2); - Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3); - Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4); - - tensor1.setRandom(); - - DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); - DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); - DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType))); - DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType))); - - TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1); - TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2); - TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3); - TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4); - - sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); - - gpu2.device(sycl_device)=gpu1.reshape(dim2); - sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor1.size())*sizeof(DataType)); - - gpu3.device(sycl_device)=gpu1.reshape(dim3); - sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType)); - - gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4); - sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType)); - for (IndexType i = 0; i < 2; ++i){ - for (IndexType j = 0; j < 3; ++j){ - for (IndexType k = 0; k < 7; ++k){ - VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); ///ColMajor - if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) { - VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); ///ColMajor - VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k)); ///ColMajor - } - else{ - //VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); /// RowMajor - VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j*7 +k)); /// RowMajor - VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i*3 +j,k)); /// RowMajor - } - } - } - } - sycl_device.deallocate(gpu_data1); - sycl_device.deallocate(gpu_data2); - sycl_device.deallocate(gpu_data3); - sycl_device.deallocate(gpu_data4); -} - - -template<typename DataType, int DataLayout, typename IndexType> -static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) -{ - typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7); - typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7); - typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1); - Tensor<DataType, 3, DataLayout, 
IndexType> tensor(dim1); - Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2); - Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3); - - tensor.setRandom(); - - DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); - DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType))); - DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType))); - - TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1); - TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2); - TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3); - - sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); - - gpu2.reshape(dim1).device(sycl_device)=gpu1; - sycl_device.memcpyDeviceToHost(tensor2d.data(), gpu_data2,(tensor2d.size())*sizeof(DataType)); - - gpu3.reshape(dim1).device(sycl_device)=gpu1; - sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType)); - - - for (IndexType i = 0; i < 2; ++i){ - for (IndexType j = 0; j < 3; ++j){ - for (IndexType k = 0; k < 7; ++k){ - VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k)); - if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) { - VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); ///ColMajor - } - else{ - VERIFY_IS_EQUAL(tensor2d(i*3 +j,k),tensor(i,j,k)); /// RowMajor - } - } - } - } - sycl_device.deallocate(gpu_data1); - sycl_device.deallocate(gpu_data2); - sycl_device.deallocate(gpu_data3); -} - - -template <typename DataType, int DataLayout, typename IndexType> -static void test_simple_slice(const Eigen::SyclDevice &sycl_device) -{ - IndexType sizeDim1 = 2; - IndexType sizeDim2 = 3; - IndexType sizeDim3 = 5; - IndexType sizeDim4 = 7; - IndexType sizeDim5 = 11; - array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; - Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange); - tensor.setRandom(); - array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}}; - Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range); - - DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); - DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range); - Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5); - Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1); - sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); - gpu2.device(sycl_device)=gpu1.slice(indices, sizes); - sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType)); - VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); - - - array<IndexType, 5> slice2_range ={{1,1,2,2,3}}; - Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range); - DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range); - Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5); - Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3); - gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2); - sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType)); - for (IndexType i = 0; i 
< 2; ++i) { - for (IndexType j = 0; j < 2; ++j) { - for (IndexType k = 0; k < 3; ++k) { - VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); - } - } - } - sycl_device.deallocate(gpu_data1); - sycl_device.deallocate(gpu_data2); - sycl_device.deallocate(gpu_data3); -} - -template<typename DataType, int DataLayout, typename IndexType> -static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) -{ - typedef Tensor<DataType, 2, DataLayout, IndexType> Tensor2f; - typedef Eigen::DSizes<IndexType, 2> Index2; - IndexType sizeDim1 = 7L; - IndexType sizeDim2 = 11L; - array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}}; - Tensor<DataType, 2, DataLayout, IndexType> tensor(tensorRange),tensor2(tensorRange); - IndexType sliceDim1 = 2; - IndexType sliceDim2 = 3; - array<IndexType, 2> sliceRange = {{sliceDim1, sliceDim2}}; - Tensor2f slice(sliceRange); - Index2 strides(1L,1L); - Index2 indicesStart(3L,4L); - Index2 indicesStop(5L,7L); - Index2 lengths(2L,3L); - - DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); - DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); - DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, tensorRange); - TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu3(gpu_data3, sliceRange); - - - tensor.setRandom(); - sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); - gpu2.device(sycl_device)=gpu1; - - slice.setRandom(); - sycl_device.memcpyHostToDevice(gpu_data3, slice.data(),(slice.size())*sizeof(DataType)); - - - gpu1.slice(indicesStart,lengths).device(sycl_device)=gpu3; - gpu2.stridedSlice(indicesStart,indicesStop,strides).device(sycl_device)=gpu3; - sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType)); - sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); - - for(IndexType i=0;i<sizeDim1;i++) - for(IndexType j=0;j<sizeDim2;j++){ - VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j)); - } - sycl_device.deallocate(gpu_data1); - sycl_device.deallocate(gpu_data2); - sycl_device.deallocate(gpu_data3); -} - -template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){ - QueueInterface queueInterface(s); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_slice<DataType, RowMajor, int64_t>(sycl_device); - test_simple_slice<DataType, ColMajor, int64_t>(sycl_device); - test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device); - test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device); - test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device); - test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device); - test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device); - test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device); -} -void test_cxx11_tensor_morphing_sycl() -{ - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_morphing_test_per_device<float>(device)); - } -} diff --git a/eigen/unsupported/test/cxx11_tensor_notification.cpp b/eigen/unsupported/test/cxx11_tensor_notification.cpp index 183ef02..c946007 100644 --- a/eigen/unsupported/test/cxx11_tensor_notification.cpp +++ 
b/eigen/unsupported/test/cxx11_tensor_notification.cpp @@ -13,6 +13,15 @@ #include "main.h" #include <Eigen/CXX11/Tensor> +#if EIGEN_OS_WIN || EIGEN_OS_WIN64 +#include <windows.h> +void sleep(int seconds) { + Sleep(seconds*1000); +} +#else +#include <unistd.h> +#endif + namespace { @@ -31,7 +40,7 @@ static void test_notification_single() Eigen::Notification n; std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter); thread_pool.Schedule(func); - EIGEN_SLEEP(1000); + sleep(1); // The thread should be waiting for the notification. VERIFY_IS_EQUAL(counter, 0); @@ -39,7 +48,7 @@ static void test_notification_single() // Unblock the thread n.Notify(); - EIGEN_SLEEP(1000); + sleep(1); // Verify the counter has been incremented VERIFY_IS_EQUAL(counter, 1); @@ -58,10 +67,10 @@ static void test_notification_multiple() thread_pool.Schedule(func); thread_pool.Schedule(func); thread_pool.Schedule(func); - EIGEN_SLEEP(1000); + sleep(1); VERIFY_IS_EQUAL(counter, 0); n.Notify(); - EIGEN_SLEEP(1000); + sleep(1); VERIFY_IS_EQUAL(counter, 4); } diff --git a/eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 908a5e5..2f86980 100644 --- a/eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -200,8 +200,6 @@ void test_cuda_trancendental() { Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem); gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f); @@ -209,7 +207,6 @@ void test_cuda_trancendental() { gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>(); gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>(); gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>(); - gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>(); gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>(); gpu_res1_half.device(gpu_device) = gpu_res1_half.exp(); @@ -220,9 +217,6 @@ void test_cuda_trancendental() { gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p(); - gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); - gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1(); - Tensor<float, 1> input1(num_elem); Tensor<Eigen::half, 1> half_prec1(num_elem); Tensor<Eigen::half, 1> full_prec1(num_elem); diff --git a/eigen/unsupported/test/cxx11_tensor_padding_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_padding_sycl.cpp deleted file mode 100644 index dc748b7..0000000 --- a/eigen/unsupported/test/cxx11_tensor_padding_sycl.cpp +++ /dev/null @@ -1,157 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. 
-// Contact: <eigen@codeplay.com> -// Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_padding_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::array; -using Eigen::SyclDevice; -using Eigen::Tensor; -using Eigen::TensorMap; - - -template<typename DataType, int DataLayout, typename IndexType> -static void test_simple_padding(const Eigen::SyclDevice& sycl_device) -{ - - IndexType sizeDim1 = 2; - IndexType sizeDim2 = 3; - IndexType sizeDim3 = 5; - IndexType sizeDim4 = 7; - array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - - Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); - tensor.setRandom(); - - array<std::pair<IndexType, IndexType>, 4> paddings; - paddings[0] = std::make_pair(0, 0); - paddings[1] = std::make_pair(2, 1); - paddings[2] = std::make_pair(3, 4); - paddings[3] = std::make_pair(0, 0); - - IndexType padedSizeDim1 = 2; - IndexType padedSizeDim2 = 6; - IndexType padedSizeDim3 = 12; - IndexType padedSizeDim4 = 7; - array<IndexType, 4> padedtensorRange = {{padedSizeDim1, padedSizeDim2, padedSizeDim3, padedSizeDim4}}; - - Tensor<DataType, 4, DataLayout, IndexType> padded(padedtensorRange); - - - DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); - DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(padded.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu2(gpu_data2, padedtensorRange); - - VERIFY_IS_EQUAL(padded.dimension(0), 2+0); - VERIFY_IS_EQUAL(padded.dimension(1), 3+3); - VERIFY_IS_EQUAL(padded.dimension(2), 5+7); - VERIFY_IS_EQUAL(padded.dimension(3), 7+0); - sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); - gpu2.device(sycl_device)=gpu1.pad(paddings); - sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType)); - for (IndexType i = 0; i < padedSizeDim1; ++i) { - for (IndexType j = 0; j < padedSizeDim2; ++j) { - for (IndexType k = 0; k < padedSizeDim3; ++k) { - for (IndexType l = 0; l < padedSizeDim4; ++l) { - if (j >= 2 && j < 5 && k >= 3 && k < 8) { - VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l)); - } else { - VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f); - } - } - } - } - } - sycl_device.deallocate(gpu_data1); - sycl_device.deallocate(gpu_data2); -} - -template<typename DataType, int DataLayout, typename IndexType> -static void test_padded_expr(const Eigen::SyclDevice& sycl_device) -{ - IndexType sizeDim1 = 2; - IndexType sizeDim2 = 3; - IndexType sizeDim3 = 5; - IndexType sizeDim4 = 7; - array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - - Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); - tensor.setRandom(); - - array<std::pair<IndexType, IndexType>, 4> paddings; - paddings[0] = std::make_pair(0, 0); - paddings[1] = std::make_pair(2, 1); - paddings[2] = std::make_pair(3, 4); - paddings[3] = std::make_pair(0, 0); - - Eigen::DSizes<IndexType, 2> reshape_dims; - reshape_dims[0] = 12; - reshape_dims[1] = 84; - - - 
Tensor<DataType, 2, DataLayout, IndexType> result(reshape_dims); - - DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); - DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(result.size()*sizeof(DataType))); - TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, reshape_dims); - - - sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); - gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims); - sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType)); - - for (IndexType i = 0; i < 2; ++i) { - for (IndexType j = 0; j < 6; ++j) { - for (IndexType k = 0; k < 12; ++k) { - for (IndexType l = 0; l < 7; ++l) { - const float result_value = DataLayout == ColMajor ? - result(i+2*j,k+12*l) : result(j+6*i,l+7*k); - if (j >= 2 && j < 5 && k >= 3 && k < 8) { - VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l)); - } else { - VERIFY_IS_EQUAL(result_value, 0.0f); - } - } - } - } - } - sycl_device.deallocate(gpu_data1); - sycl_device.deallocate(gpu_data2); -} - -template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){ - QueueInterface queueInterface(s); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_padding<DataType, RowMajor, int64_t>(sycl_device); - test_simple_padding<DataType, ColMajor, int64_t>(sycl_device); - test_padded_expr<DataType, RowMajor, int64_t>(sycl_device); - test_padded_expr<DataType, ColMajor, int64_t>(sycl_device); - -} -void test_cxx11_tensor_padding_sycl() -{ - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_padding_test_per_device<float>(device)); - } -} diff --git a/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp index 440d48b..a9ef829 100644 --- a/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -14,168 +14,125 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> -template <typename DataType, int DataLayout, typename IndexType> -static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) { - const IndexType num_rows = 452; - const IndexType num_cols = 765; - array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; +static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { - Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); - Tensor<DataType, 0, DataLayout, IndexType> full_redux; - Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; - - in.setRandom(); - - full_redux = in.mean(); - - DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); - - TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data); - - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.mean(); - 
sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); - // Check that the CPU and GPU reductions return the same result. - VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); - sycl_device.deallocate(gpu_in_data); - sycl_device.deallocate(gpu_out_data); -} + const int num_rows = 452; + const int num_cols = 765; + array<int, 2> tensorRange = {{num_rows, num_cols}}; - -template <typename DataType, int DataLayout, typename IndexType> -static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) { - - const IndexType num_rows = 876; - const IndexType num_cols = 953; - array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; - - Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); - Tensor<DataType, 0, DataLayout, IndexType> full_redux; - Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; + Tensor<float, 2> in(tensorRange); + Tensor<float, 0> full_redux; + Tensor<float, 0> full_redux_gpu; in.setRandom(); - full_redux = in.minimum(); + full_redux = in.sum(); - DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); + float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float))); + float* gpu_out_data =(float*)sycl_device.allocate(sizeof(float)); - TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data); + TensorMap<Tensor<float, 2> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<float, 0> > out_gpu(gpu_out_data); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.minimum(); - sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float)); + out_gpu.device(sycl_device) = in_gpu.sum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(float)); // Check that the CPU and GPU reductions return the same result. 
VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } +static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) { -template <typename DataType, int DataLayout, typename IndexType> -static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) { - - IndexType dim_x = 145; - IndexType dim_y = 1; - IndexType dim_z = 67; + int dim_x = 145; + int dim_y = 1; + int dim_z = 67; - array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array<IndexType, 1> red_axis; + array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<int, 1> red_axis; red_axis[0] = 0; - array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}}; + array<int, 2> reduced_tensorRange = {{dim_y, dim_z}}; - Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); - Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); - Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); + Tensor<float, 3> in(tensorRange); + Tensor<float, 2> redux(reduced_tensorRange); + Tensor<float, 2> redux_gpu(reduced_tensorRange); in.setRandom(); - redux= in.maximum(red_axis); + redux= in.sum(red_axis); - DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); + float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float))); + float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float))); - TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap<Tensor<float, 3> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<float, 2> > out_gpu(gpu_out_data, reduced_tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float)); + out_gpu.device(sycl_device) = in_gpu.sum(red_axis); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float)); // Check that the CPU and GPU reductions return the same result. 
- for(IndexType j=0; j<reduced_tensorRange[0]; j++ ) - for(IndexType k=0; k<reduced_tensorRange[1]; k++ ) + for(int j=0; j<reduced_tensorRange[0]; j++ ) + for(int k=0; k<reduced_tensorRange[1]; k++ ) VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } -template <typename DataType, int DataLayout, typename IndexType> -static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) { +static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) { - IndexType dim_x = 567; - IndexType dim_y = 1; - IndexType dim_z = 47; + int dim_x = 567; + int dim_y = 1; + int dim_z = 47; - array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array<IndexType, 1> red_axis; + array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<int, 1> red_axis; red_axis[0] = 2; - array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}}; + array<int, 2> reduced_tensorRange = {{dim_x, dim_y}}; - Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); - Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); - Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); + Tensor<float, 3> in(tensorRange); + Tensor<float, 2> redux(reduced_tensorRange); + Tensor<float, 2> redux_gpu(reduced_tensorRange); in.setRandom(); redux= in.sum(red_axis); - DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); + float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float))); + float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float))); - TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap<Tensor<float, 3> > in_gpu(gpu_in_data, tensorRange); + TensorMap<Tensor<float, 2> > out_gpu(gpu_out_data, reduced_tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float)); // Check that the CPU and GPU reductions return the same result. 
- for(IndexType j=0; j<reduced_tensorRange[0]; j++ ) - for(IndexType k=0; k<reduced_tensorRange[1]; k++ ) + for(int j=0; j<reduced_tensorRange[0]; j++ ) + for(int k=0; k<reduced_tensorRange[1]; k++ ) VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } -template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::device& d){ - std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; - QueueInterface queueInterface(d); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - - test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device); - test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device); - test_first_dim_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device); - test_last_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device); -} + void test_cxx11_tensor_reduction_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_reduction_test_per_device<float>(device)); - } + cl::sycl::gpu_selector s; + Eigen::SyclDevice sycl_device(s); + CALL_SUBTEST((test_full_reductions_sycl(sycl_device))); + CALL_SUBTEST((test_first_dim_reductions_sycl(sycl_device))); + CALL_SUBTEST((test_last_dim_reductions_sycl(sycl_device))); + } diff --git a/eigen/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_reverse_sycl.cpp deleted file mode 100644 index 2f54844..0000000 --- a/eigen/unsupported/test/cxx11_tensor_reverse_sycl.cpp +++ /dev/null @@ -1,221 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - - -template <typename DataType, int DataLayout, typename IndexType> -static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { - - IndexType dim1 = 2; - IndexType dim2 = 3; - IndexType dim3 = 5; - IndexType dim4 = 7; - - array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; - Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); - Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange); - tensor.setRandom(); - - array<bool, 4> dim_rev; - dim_rev[0] = false; - dim_rev[1] = true; - dim_rev[2] = true; - dim_rev[3] = false; - - DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =static_cast<DataType*>(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType))); - - TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data, tensorRange); - - sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); - sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); - // Check that the CPU and GPU reductions return the same result. - for (IndexType i = 0; i < 2; ++i) { - for (IndexType j = 0; j < 3; ++j) { - for (IndexType k = 0; k < 5; ++k) { - for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); - } - } - } - } - dim_rev[0] = true; - dim_rev[1] = false; - dim_rev[2] = false; - dim_rev[3] = false; - - out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); - sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); - - for (IndexType i = 0; i < 2; ++i) { - for (IndexType j = 0; j < 3; ++j) { - for (IndexType k = 0; k < 5; ++k) { - for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); - } - } - } - } - - dim_rev[0] = true; - dim_rev[1] = false; - dim_rev[2] = false; - dim_rev[3] = true; - out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); - sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); - - for (IndexType i = 0; i < 2; ++i) { - for (IndexType j = 0; j < 3; ++j) { - for (IndexType k = 0; k < 5; ++k) { - for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); - } - } - } - } - - sycl_device.deallocate(gpu_in_data); - sycl_device.deallocate(gpu_out_data); -} - - - -template <typename DataType, int DataLayout, typename IndexType> -static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue) -{ - IndexType dim1 = 2; - IndexType dim2 = 3; - IndexType dim3 = 5; - IndexType dim4 = 7; - - array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; - Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); - Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange); - Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange); - tensor.setRandom(); - - array<bool, 4> dim_rev; - 
dim_rev[0] = false; - dim_rev[1] = true; - dim_rev[2] = false; - dim_rev[3] = true; - - DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data_expected =static_cast<DataType*>(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data_result =static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); - - TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(gpu_out_data_expected, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(gpu_out_data_result, tensorRange); - - - sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); - - if (LValue) { - out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu; - } else { - out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev); - } - sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType)); - - - array<IndexType, 4> src_slice_dim; - src_slice_dim[0] = 2; - src_slice_dim[1] = 3; - src_slice_dim[2] = 1; - src_slice_dim[3] = 7; - array<IndexType, 4> src_slice_start; - src_slice_start[0] = 0; - src_slice_start[1] = 0; - src_slice_start[2] = 0; - src_slice_start[3] = 0; - array<IndexType, 4> dst_slice_dim = src_slice_dim; - array<IndexType, 4> dst_slice_start = src_slice_start; - - for (IndexType i = 0; i < 5; ++i) { - if (LValue) { - out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = - in_gpu.slice(src_slice_start, src_slice_dim); - } else { - out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = - in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev); - } - src_slice_start[2] += 1; - dst_slice_start[2] += 1; - } - sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); - - for (IndexType i = 0; i < expected.dimension(0); ++i) { - for (IndexType j = 0; j < expected.dimension(1); ++j) { - for (IndexType k = 0; k < expected.dimension(2); ++k) { - for (IndexType l = 0; l < expected.dimension(3); ++l) { - VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); - } - } - } - } - - dst_slice_start[2] = 0; - result.setRandom(); - sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); - for (IndexType i = 0; i < 5; ++i) { - if (LValue) { - out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = - in_gpu.slice(dst_slice_start, dst_slice_dim); - } else { - out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = - in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); - } - dst_slice_start[2] += 1; - } - sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); - - for (IndexType i = 0; i < expected.dimension(0); ++i) { - for (IndexType j = 0; j < expected.dimension(1); ++j) { - for (IndexType k = 0; k < expected.dimension(2); ++k) { - for (IndexType l = 0; l < expected.dimension(3); ++l) { - VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); - } - } - } - } -} - - - -template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::device& d){ - std::cout << "Running on " << d.template 
get_info<cl::sycl::info::device::name>() << std::endl; - QueueInterface queueInterface(d); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device); - test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device); - test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false); - test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false); - test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true); - test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true); -} -void test_cxx11_tensor_reverse_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_reverse_test_per_device<float>(device)); - } -} diff --git a/eigen/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_shuffling_sycl.cpp deleted file mode 100644 index c88db7c..0000000 --- a/eigen/unsupported/test/cxx11_tensor_shuffling_sycl.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// Benoit Steiner <benoit.steiner.goog@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_shuffling_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::array; -using Eigen::SyclDevice; -using Eigen::Tensor; -using Eigen::TensorMap; - -template <typename DataType, int DataLayout, typename IndexType> -static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) -{ - IndexType sizeDim1 = 2; - IndexType sizeDim2 = 3; - IndexType sizeDim3 = 5; - IndexType sizeDim4 = 7; - array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange); - Tensor<DataType, 4, DataLayout,IndexType> no_shuffle(tensorRange); - tensor.setRandom(); - - const size_t buffSize =tensor.size()*sizeof(DataType); - array<IndexType, 4> shuffles; - shuffles[0] = 0; - shuffles[1] = 1; - shuffles[2] = 2; - shuffles[3] = 3; - DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(buffSize)); - DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize)); - - - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu2(gpu_data2, tensorRange); - - sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize); - - gpu2.device(sycl_device)=gpu1.shuffle(shuffles); - sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize); - sycl_device.synchronize(); - - VERIFY_IS_EQUAL(no_shuffle.dimension(0), sizeDim1); - VERIFY_IS_EQUAL(no_shuffle.dimension(1), sizeDim2); - VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3); - VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { - for (IndexType l = 0; l < sizeDim4; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), 
no_shuffle(i,j,k,l)); - } - } - } - } - - shuffles[0] = 2; - shuffles[1] = 3; - shuffles[2] = 1; - shuffles[3] = 0; - array<IndexType, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; - Tensor<DataType, 4, DataLayout,IndexType> shuffle(tensorrangeShuffle); - DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize)); - TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu3(gpu_data3, tensorrangeShuffle); - - gpu3.device(sycl_device)=gpu1.shuffle(shuffles); - sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize); - sycl_device.synchronize(); - - VERIFY_IS_EQUAL(shuffle.dimension(0), sizeDim3); - VERIFY_IS_EQUAL(shuffle.dimension(1), sizeDim4); - VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2); - VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { - for (IndexType l = 0; l < sizeDim4; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); - } - } - } - } -} - - -template<typename DataType, typename dev_Selector> void sycl_shuffling_test_per_device(dev_Selector s){ - QueueInterface queueInterface(s); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device); - test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device); - -} -void test_cxx11_tensor_shuffling_sycl() -{ - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device)); - } -} diff --git a/eigen/unsupported/test/cxx11_tensor_striding_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_striding_sycl.cpp deleted file mode 100644 index 603c374..0000000 --- a/eigen/unsupported/test/cxx11_tensor_striding_sycl.cpp +++ /dev/null @@ -1,203 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: <eigen@codeplay.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t -#define EIGEN_USE_SYCL - -#include <iostream> -#include <chrono> -#include <ctime> - -#include "main.h" -#include <unsupported/Eigen/CXX11/Tensor> - -using Eigen::array; -using Eigen::SyclDevice; -using Eigen::Tensor; -using Eigen::TensorMap; - - -template <typename DataType, int DataLayout, typename IndexType> -static void test_simple_striding(const Eigen::SyclDevice& sycl_device) -{ - - Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}}; - Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}}; - - - Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims); - Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensor_dims); - Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims); - - - std::size_t tensor_bytes = tensor.size() * sizeof(DataType); - std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType); - std::size_t stride_bytes = stride.size() * sizeof(DataType); - DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes)); - DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes)); - DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims); - - - tensor.setRandom(); - array<IndexType, 4> strides; - strides[0] = 1; - strides[1] = 1; - strides[2] = 1; - strides[3] = 1; - sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes); - gpu_no_stride.device(sycl_device)=gpu_tensor.stride(strides); - sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes); - - //no_stride = tensor.stride(strides); - - VERIFY_IS_EQUAL(no_stride.dimension(0), 2); - VERIFY_IS_EQUAL(no_stride.dimension(1), 3); - VERIFY_IS_EQUAL(no_stride.dimension(2), 5); - VERIFY_IS_EQUAL(no_stride.dimension(3), 7); - - for (IndexType i = 0; i < 2; ++i) { - for (IndexType j = 0; j < 3; ++j) { - for (IndexType k = 0; k < 5; ++k) { - for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l)); - } - } - } - } - - strides[0] = 2; - strides[1] = 4; - strides[2] = 2; - strides[3] = 3; -//Tensor<float, 4, DataLayout> stride; -// stride = tensor.stride(strides); - - gpu_stride.device(sycl_device)=gpu_tensor.stride(strides); - sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); - - VERIFY_IS_EQUAL(stride.dimension(0), 1); - VERIFY_IS_EQUAL(stride.dimension(1), 1); - VERIFY_IS_EQUAL(stride.dimension(2), 3); - VERIFY_IS_EQUAL(stride.dimension(3), 3); - - for (IndexType i = 0; i < 1; ++i) { - for (IndexType j = 0; j < 1; ++j) { - for (IndexType k = 0; k < 3; ++k) { - for (IndexType l = 0; l < 3; ++l) { - VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l)); - } - } - } - } - - sycl_device.deallocate(d_tensor); - sycl_device.deallocate(d_no_stride); - sycl_device.deallocate(d_stride); -} - -template <typename DataType, int DataLayout, typename IndexType> -static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device) -{ - - Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}}; - Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}}; - - - Tensor<DataType, 4, 
DataLayout, IndexType> tensor(tensor_dims); - Tensor<DataType, 4, DataLayout,IndexType> no_stride(stride_dims); - Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims); - - - std::size_t tensor_bytes = tensor.size() * sizeof(DataType); - std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType); - std::size_t stride_bytes = stride.size() * sizeof(DataType); - - DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes)); - DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes)); - DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes)); - - Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims); - Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims); - - //Tensor<float, 4, DataLayout> tensor(2,3,5,7); - tensor.setRandom(); - array<IndexType, 4> strides; - strides[0] = 2; - strides[1] = 4; - strides[2] = 2; - strides[3] = 3; - -// Tensor<float, 4, DataLayout> result(3, 12, 10, 21); -// result.stride(strides) = tensor; - sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes); - gpu_stride.stride(strides).device(sycl_device)=gpu_tensor; - sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); - - for (IndexType i = 0; i < 2; ++i) { - for (IndexType j = 0; j < 3; ++j) { - for (IndexType k = 0; k < 5; ++k) { - for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l)); - } - } - } - } - - array<IndexType, 4> no_strides; - no_strides[0] = 1; - no_strides[1] = 1; - no_strides[2] = 1; - no_strides[3] = 1; -// Tensor<float, 4, DataLayout> result2(3, 12, 10, 21); -// result2.stride(strides) = tensor.stride(no_strides); - - gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides); - sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes); - - for (IndexType i = 0; i < 2; ++i) { - for (IndexType j = 0; j < 3; ++j) { - for (IndexType k = 0; k < 5; ++k) { - for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l)); - } - } - } - } - sycl_device.deallocate(d_tensor); - sycl_device.deallocate(d_no_stride); - sycl_device.deallocate(d_stride); -} - - -template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){ - QueueInterface queueInterface(s); - auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_simple_striding<float, ColMajor, int64_t>(sycl_device); - test_simple_striding<float, RowMajor, int64_t>(sycl_device); - test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device); - test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device); -} - -void test_cxx11_tensor_striding_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(tensorStridingPerDevice(device)); - } -} diff --git a/eigen/unsupported/test/cxx11_tensor_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_sycl.cpp index 5cd0f4c..6a9c334 100644 --- a/eigen/unsupported/test/cxx11_tensor_sycl.cpp +++ b/eigen/unsupported/test/cxx11_tensor_sycl.cpp @@ -16,7 +16,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_SYCL #include "main.h" @@ -27,105 +27,36 @@ using 
Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template <typename DataType, int DataLayout, typename IndexType> -void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { - IndexType sizeDim1 = 100; - IndexType sizeDim2 = 10; - IndexType sizeDim3 = 20; - array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange); - Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange); - Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange); - Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange); +void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { - in1 = in1.random(); - - DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); - DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType))); - - TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); - - sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType)); - gpu1.device(sycl_device) = gpu1 * 3.14f; - gpu2.device(sycl_device) = gpu2 * 2.7f; - sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1,(out1.size())*sizeof(DataType)); - sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1,(out2.size())*sizeof(DataType)); - sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType)); - sycl_device.synchronize(); - - for (IndexType i = 0; i < in1.size(); ++i) { - VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f); - VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f); - VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f); - } - - sycl_device.deallocate(gpu_data1); - sycl_device.deallocate(gpu_data2); -} - -template <typename DataType, int DataLayout, typename IndexType> -void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { - IndexType size = 20; - array<IndexType, 1> tensorRange = {{size}}; - Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange); - Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange); - Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange); - - in1 = in1.random(); - in2 = in1; - - DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); - - TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange); - sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType)); - sycl_device.synchronize(); - in1.setZero(); - - sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType)); - sycl_device.synchronize(); - - for (IndexType i = 0; i < in1.size(); ++i) { - VERIFY_IS_APPROX(out(i), in2(i)); - } - - sycl_device.deallocate(gpu_data); -} - -template <typename DataType, int DataLayout, typename IndexType> -void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { - - IndexType sizeDim1 = 100; - IndexType sizeDim2 = 10; - IndexType sizeDim3 = 20; - array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange); - Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange); - Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange); - Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange); + int sizeDim1 = 100; + int sizeDim2 = 100; + int sizeDim3 = 100; + array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + 
Tensor<float, 3> in1(tensorRange); + Tensor<float, 3> in2(tensorRange); + Tensor<float, 3> in3(tensorRange); + Tensor<float, 3> out(tensorRange); in2 = in2.random(); in3 = in3.random(); - DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); - DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType))); - DataType * gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType))); - DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); + float * gpu_in1_data = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float))); + float * gpu_in2_data = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float))); + float * gpu_in3_data = static_cast<float*>(sycl_device.allocate(in3.dimensions().TotalSize()*sizeof(float))); + float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float))); - TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange); - TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); + TensorMap<Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange); + TensorMap<Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange); + TensorMap<Tensor<float, 3>> gpu_in3(gpu_in3_data, tensorRange); + TensorMap<Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange); /// a=1.2f gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f); - sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType)); - sycl_device.synchronize(); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.dimensions().TotalSize())*sizeof(float)); + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(in1(i,j,k), 1.2f); } } @@ -134,12 +65,10 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { /// a=b*1.2f gpu_out.device(sycl_device) = gpu_in1 * 1.2f; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType)); - sycl_device.synchronize(); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.dimensions().TotalSize())*sizeof(float)); + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 1.2f); } @@ -148,14 +77,12 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { printf("a=b*1.2f Test Passed\n"); /// c=a*b - sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(float)); gpu_out.device(sycl_device) = gpu_in1 * gpu_in2; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); - sycl_device.synchronize(); - - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; 
k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in2(i,j,k)); @@ -166,11 +93,10 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { /// c=a+b gpu_out.device(sycl_device) = gpu_in1 + gpu_in2; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); - sycl_device.synchronize(); - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k)); @@ -181,11 +107,10 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { /// c=a*a gpu_out.device(sycl_device) = gpu_in1 * gpu_in1; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); - sycl_device.synchronize(); - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in1(i,j,k)); @@ -196,11 +121,10 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { //a*3.14f + b*2.7f gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f); - sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType)); - sycl_device.synchronize(); - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 3.14f + in2(i,j,k) * 2.7f); @@ -210,13 +134,12 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { printf("a*3.14f + b*2.7f Test Passed\n"); ///d= (a>0.5? b:c) - sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.size())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.dimensions().TotalSize())*sizeof(float)); gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3); - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); - sycl_device.synchronize(); - for (IndexType i = 0; i < sizeDim1; ++i) { - for (IndexType j = 0; j < sizeDim2; ++j) { - for (IndexType k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f) ? 
in2(i, j, k) : in3(i, j, k)); @@ -229,48 +152,8 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_in3_data); sycl_device.deallocate(gpu_out_data); } -template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType> -static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){ - IndexType size = 20; - array<IndexType, 1> tensorRange = {{size}}; - Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange); - Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange); - Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange); - - in = in.random(); - - Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1))); - Scalar2 * gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2))); - - TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange); - TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1)); - gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>(); - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2)); - out_host = in. template cast<Scalar2>(); - for(IndexType i=0; i< size; i++) - { - VERIFY_IS_APPROX(out(i), out_host(i)); - } - printf("cast Test Passed\n"); - sycl_device.deallocate(gpu_in_data); - sycl_device.deallocate(gpu_out_data); -} -template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ - QueueInterface queueInterface(s); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device); - test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device); - test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device); - test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device); - test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device); - test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device); - test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device); - test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device); -} - void test_cxx11_tensor_sycl() { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); - } + cl::sycl::gpu_selector s; + Eigen::SyclDevice sycl_device(s); + CALL_SUBTEST(test_sycl_cpu(sycl_device)); } diff --git a/eigen/unsupported/test/polynomialsolver.cpp b/eigen/unsupported/test/polynomialsolver.cpp index 7ad4aa6..0c87478 100644 --- a/eigen/unsupported/test/polynomialsolver.cpp +++ b/eigen/unsupported/test/polynomialsolver.cpp @@ -32,10 +32,9 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) { typedef typename POLYNOMIAL::Index Index; typedef typename POLYNOMIAL::Scalar Scalar; - typedef typename POLYNOMIAL::RealScalar RealScalar; typedef typename SOLVER::RootsType RootsType; - typedef Matrix<RealScalar,Deg,1> EvalRootsType; + typedef Matrix<Scalar,Deg,1> EvalRootsType; const Index deg = pols.size()-1; @@ -58,7 +57,7 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) cerr << endl; } - std::vector<RealScalar> rootModuli( roots.size() ); + std::vector<Scalar> rootModuli( roots.size() ); Map< EvalRootsType > aux( &rootModuli[0], roots.size() ); aux = roots.array().abs(); std::sort( rootModuli.begin(), rootModuli.end() ); @@ -84,7 +83,7 @@ void evalSolver( const POLYNOMIAL& pols ) { typedef typename 
POLYNOMIAL::Scalar Scalar; - typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType; + typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType; PolynomialSolverType psolve; aux_evalSolver<Deg, POLYNOMIAL, PolynomialSolverType>( pols, psolve ); @@ -98,7 +97,6 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const { using std::sqrt; typedef typename POLYNOMIAL::Scalar Scalar; - typedef typename POLYNOMIAL::RealScalar RealScalar; typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType; @@ -109,12 +107,15 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const // 1) the roots found are correct // 2) the roots have distinct moduli + typedef typename POLYNOMIAL::Scalar Scalar; + typedef typename REAL_ROOTS::Scalar Real; + //Test realRoots - std::vector< RealScalar > calc_realRoots; - psolve.realRoots( calc_realRoots, test_precision<RealScalar>()); - VERIFY_IS_EQUAL( calc_realRoots.size() , (size_t)real_roots.size() ); + std::vector< Real > calc_realRoots; + psolve.realRoots( calc_realRoots ); + VERIFY( calc_realRoots.size() == (size_t)real_roots.size() ); - const RealScalar psPrec = sqrt( test_precision<RealScalar>() ); + const Scalar psPrec = sqrt( test_precision<Scalar>() ); for( size_t i=0; i<calc_realRoots.size(); ++i ) { @@ -137,7 +138,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const bool hasRealRoot; //Test absGreatestRealRoot - RealScalar r = psolve.absGreatestRealRoot( hasRealRoot ); + Real r = psolve.absGreatestRealRoot( hasRealRoot ); VERIFY( hasRealRoot == (real_roots.size() > 0 ) ); if( hasRealRoot ){ VERIFY( internal::isApprox( real_roots.array().abs().maxCoeff(), abs(r), psPrec ) ); } @@ -166,11 +167,9 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const template<typename _Scalar, int _Deg> void polynomialsolver(int deg) { - typedef typename NumTraits<_Scalar>::Real RealScalar; - typedef internal::increment_if_fixed_size<_Deg> Dim; + typedef internal::increment_if_fixed_size<_Deg> Dim; typedef Matrix<_Scalar,Dim::ret,1> PolynomialType; typedef Matrix<_Scalar,_Deg,1> EvalRootsType; - typedef Matrix<RealScalar,_Deg,1> RealRootsType; cout << "Standard cases" << endl; PolynomialType pols = PolynomialType::Random(deg+1); @@ -183,11 +182,15 @@ void polynomialsolver(int deg) evalSolver<_Deg,PolynomialType>( pols ); cout << "Test sugar" << endl; - RealRootsType realRoots = RealRootsType::Random(deg); + EvalRootsType realRoots = EvalRootsType::Random(deg); roots_to_monicPolynomial( realRoots, pols ); evalSolverSugarFunction<_Deg>( pols, - realRoots.template cast <std::complex<RealScalar> >().eval(), + realRoots.template cast < + std::complex< + typename NumTraits<_Scalar>::Real + > + >(), realRoots ); } @@ -211,6 +214,5 @@ void test_polynomialsolver() internal::random<int>(9,13) )) ); CALL_SUBTEST_11((polynomialsolver<float,Dynamic>(1)) ); - CALL_SUBTEST_12((polynomialsolver<std::complex<double>,Dynamic>(internal::random<int>(2,13))) ); } } diff --git a/eigen/unsupported/test/sparse_extra.cpp b/eigen/unsupported/test/sparse_extra.cpp index 4f6723d..a010ceb 100644 --- a/eigen/unsupported/test/sparse_extra.cpp +++ b/eigen/unsupported/test/sparse_extra.cpp @@ -129,19 +129,6 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re } -template<typename SparseMatrixType> -void check_marketio() -{ - typedef Matrix<typename SparseMatrixType::Scalar, Dynamic, Dynamic> DenseMatrix; - Index rows = internal::random<Index>(1,100); - Index cols = 
internal::random<Index>(1,100); - SparseMatrixType m1, m2; - m1 = DenseMatrix::Random(rows, cols).sparseView(); - saveMarket(m1, "sparse_extra.mtx"); - loadMarket(m2, "sparse_extra.mtx"); - VERIFY_IS_EQUAL(DenseMatrix(m1),DenseMatrix(m2)); -} - void test_sparse_extra() { for(int i = 0; i < g_repeat; i++) { @@ -156,15 +143,5 @@ void test_sparse_extra() CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, ColMajor> >()) ); CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, RowMajor> >()) ); - - CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,int> >()) ); - CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,int> >()) ); - CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,int> >()) ); - CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,int> >()) ); - CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,long int> >()) ); - CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,long int> >()) ); - CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,long int> >()) ); - CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,long int> >()) ); - TEST_SET_BUT_UNUSED_VARIABLE(s); } }