From 88534ba623421c956d8ffcda2d27f41d704d15ef Mon Sep 17 00:00:00 2001 From: Stanislaw Halik Date: Tue, 3 Jul 2018 07:37:12 +0200 Subject: update eigen --- eigen/unsupported/Eigen/CXX11/Tensor | 9 +- eigen/unsupported/Eigen/CXX11/ThreadPool | 13 - eigen/unsupported/Eigen/CXX11/src/Tensor/README.md | 8 +- .../Eigen/CXX11/src/Tensor/TensorBase.h | 6 - .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 30 +- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 6 - .../Eigen/CXX11/src/Tensor/TensorContraction.h | 287 +------------ .../CXX11/src/Tensor/TensorContractionBlocking.h | 134 ------ .../Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 101 ++--- .../CXX11/src/Tensor/TensorContractionMapper.h | 76 ++-- .../Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 400 ----------------- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 208 +-------- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 3 - .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 476 --------------------- .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 7 +- .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 413 +++--------------- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 13 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 11 +- .../unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 41 +- .../CXX11/src/Tensor/TensorForwardDeclarations.h | 14 +- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 12 +- .../Eigen/CXX11/src/Tensor/TensorMacros.h | 8 - .../Eigen/CXX11/src/Tensor/TensorMeta.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 53 +-- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 7 - .../Eigen/CXX11/src/Tensor/TensorReduction.h | 30 +- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 1 + .../Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 179 +++++--- .../Eigen/CXX11/src/Tensor/TensorReverse.h | 5 - .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorStorage.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 16 +- .../Eigen/CXX11/src/Tensor/TensorSycl.h | 12 - .../Tensor/TensorSyclConvertToDeviceExpression.h | 54 +-- .../CXX11/src/Tensor/TensorSyclExprConstructor.h | 188 +------- .../CXX11/src/Tensor/TensorSyclExtractAccessor.h | 265 +++++------- .../CXX11/src/Tensor/TensorSyclExtractFunctors.h | 322 ++++---------- .../Eigen/CXX11/src/Tensor/TensorSyclFunctors.h | 245 ----------- .../Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 138 ++---- .../CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 57 +-- .../Eigen/CXX11/src/Tensor/TensorSyclRun.h | 77 ++-- .../Eigen/CXX11/src/Tensor/TensorSyclTuple.h | 2 - .../Eigen/CXX11/src/Tensor/TensorTraits.h | 6 - .../Eigen/CXX11/src/Tensor/TensorUInt128.h | 1 - .../CXX11/src/ThreadPool/NonBlockingThreadPool.h | 142 ++---- .../Eigen/CXX11/src/ThreadPool/RunQueue.h | 7 - .../Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h | 8 - .../Eigen/CXX11/src/ThreadPool/ThreadCancel.h | 23 - .../Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h | 2 - .../CXX11/src/ThreadPool/ThreadPoolInterface.h | 6 - eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 6 +- .../Eigen/CXX11/src/util/EmulateArray.h | 13 +- .../Eigen/src/AutoDiff/AutoDiffScalar.h | 7 + .../Eigen/src/EulerAngles/EulerAngles.h | 257 
++++++----- .../Eigen/src/EulerAngles/EulerSystem.h | 184 ++++---- .../Eigen/src/MatrixFunctions/MatrixFunction.h | 11 +- .../Eigen/src/MatrixFunctions/MatrixLogarithm.h | 2 +- .../unsupported/Eigen/src/Polynomials/Companion.h | 50 +-- .../Eigen/src/Polynomials/PolynomialSolver.h | 18 +- eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h | 89 ++-- .../src/SpecialFunctions/SpecialFunctionsImpl.h | 8 +- 68 files changed, 1044 insertions(+), 3779 deletions(-) delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h (limited to 'eigen/unsupported/Eigen') diff --git a/eigen/unsupported/Eigen/CXX11/Tensor b/eigen/unsupported/Eigen/CXX11/Tensor index 3991609..7ecb4c7 100644 --- a/eigen/unsupported/Eigen/CXX11/Tensor +++ b/eigen/unsupported/Eigen/CXX11/Tensor @@ -13,14 +13,13 @@ #include "../../../Eigen/Core" -#if defined(EIGEN_USE_SYCL) +#ifdef EIGEN_USE_SYCL #undef min #undef max #undef isnan #undef isinf #undef isfinite #include -#include #include #include #include @@ -53,10 +52,8 @@ typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; -#include #else #include -#include #endif #if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 @@ -71,10 +68,6 @@ typedef unsigned __int64 uint64_t; #include #endif -#if defined(EIGEN_USE_LIBXSMM) -#include "libxsmm.h" -#endif - #ifdef EIGEN_USE_THREADS #include "ThreadPool" #endif diff --git a/eigen/unsupported/Eigen/CXX11/ThreadPool b/eigen/unsupported/Eigen/CXX11/ThreadPool index c346141..09d637e 100644 --- a/eigen/unsupported/Eigen/CXX11/ThreadPool +++ b/eigen/unsupported/Eigen/CXX11/ThreadPool @@ -50,7 +50,6 @@ #include "src/ThreadPool/ThreadLocal.h" #include "src/ThreadPool/ThreadYield.h" -#include "src/ThreadPool/ThreadCancel.h" #include "src/ThreadPool/EventCount.h" #include "src/ThreadPool/RunQueue.h" #include "src/ThreadPool/ThreadPoolInterface.h" @@ -58,18 +57,6 @@ #include "src/ThreadPool/SimpleThreadPool.h" #include "src/ThreadPool/NonBlockingThreadPool.h" - -// Use the more efficient NonBlockingThreadPool by default. -namespace Eigen { -#ifndef EIGEN_USE_SIMPLE_THREAD_POOL -template using ThreadPoolTempl = NonBlockingThreadPoolTempl; -typedef NonBlockingThreadPool ThreadPool; -#else -template using ThreadPoolTempl = SimpleThreadPoolTempl; -typedef SimpleThreadPool ThreadPool; -#endif -} // namespace Eigen - #endif #include diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md b/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md index 38cdb9c..98e8381 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1737,9 +1737,11 @@ TODO ## Representation of scalar values -Scalar values are often represented by tensors of size 1 and rank 0.For example -Tensor::maximum() currently returns a Tensor. Similarly, the inner -product of 2 1d tensors (through contractions) returns a 0d tensor. +Scalar values are often represented by tensors of size 1 and rank 1. It would be +more logical and user friendly to use tensors of rank 0 instead. For example +Tensor::maximum() currently returns a Tensor. Similarly, the inner +product of 2 1d tensors (through contractions) returns a 1d tensor. 
In the +future these operations might be updated to return 0d tensors instead. ## Limitations diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index fbe3408..7a45a5c 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -185,12 +185,6 @@ class TensorBase return unaryExpr(internal::scalar_exp_op()); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - expm1() const { - return unaryExpr(internal::scalar_expm1_op()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> log() const { diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 23a7446..4cfe300 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -54,7 +54,7 @@ struct is_input_scalar > { static const bool value = true; }; #ifndef EIGEN_EMULATE_CXX11_META_H -template +template struct is_input_scalar > { static const bool value = (Sizes::total_size == 1); }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index c46a778..1ba7ef1 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -50,7 +50,6 @@ template struct DimensionId { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { - EIGEN_UNUSED_VARIABLE(dim); eigen_assert(dim == DimId); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { @@ -151,7 +150,7 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset()) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(NumInputDims > m_dim.actualDim()); @@ -207,7 +206,7 @@ struct TensorEvaluator, Device> eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; @@ -219,7 +218,7 @@ struct TensorEvaluator, Device> PacketReturnType rslt = internal::pload(values); return rslt; } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. 
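Aside (not part of the patch; illustration only): the TensorChipping.h hunks above touch the evaluator behind Tensor::chip(), which fixes one dimension of a tensor at a given offset and takes a stride-1 fast path when the chipped dimension is the innermost one in ColMajor order. A minimal usage sketch against the public Tensor API, with sizes chosen arbitrarily:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 3> t(4, 5, 6);
      t.setRandom();
      // Fix dimension 1 at offset 2: the result is a rank-2 (4 x 6) slice.
      Eigen::Tensor<float, 2> slice = t.chip(2, /*dim=*/1);
      // Chipping the innermost ColMajor dimension (dim 0) is the m_stride == 1
      // fast path discussed in the evaluator's packet()/srcCoeff() code above.
      Eigen::Tensor<float, 2> inner = t.chip(1, /*dim=*/0);  // 5 x 6 slice
      return (slice.dimension(0) == 4 && inner.dimension(1) == 6) ? 0 : 1;
    }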
eigen_assert(m_stride > index); return m_impl.template packet(index + m_inputOffset); @@ -275,29 +274,17 @@ struct TensorEvaluator, Device> } } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const { - return m_dim.actualDim(); - } - - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const { - return m_offset; - } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; @@ -317,9 +304,6 @@ struct TensorEvaluator, Device> TensorEvaluator m_impl; const internal::DimensionId m_dim; const Device& m_device; -// required by sycl - const DenseIndex m_offset; - }; @@ -360,7 +344,7 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(this->m_stride == 1); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; @@ -371,7 +355,7 @@ struct TensorEvaluator, Device> inputIndex += this->m_inputStride; } } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { // m_stride is aways greater than index, so let's avoid the integer division. 
eigen_assert(this->m_stride > index); this->m_impl.template writePacket(index + this->m_inputOffset, x); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 2c7ba96..59bf90d 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -276,12 +276,6 @@ struct TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - const Axis& axis() const { return m_axis; } protected: Dimensions m_dimensions; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index bf4a476..20b29e5 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -20,70 +20,6 @@ namespace Eigen { * */ namespace internal { -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) -template -void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) { - size_t psize = packet_traits::size; // Packet size - typedef typename packet_traits::type Packet; // Packet type - size_t alignment = psize*sizeof(Scalar); // Needed alignment - if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 && - (ldsrc*sizeof(Scalar)) % alignment == 0 && - reinterpret_cast(src) % alignment == 0 && - reinterpret_cast(dst) % alignment == 0) { - // Optimized version using packets - size_t num_packets = rows / psize; - for (Index col = 0; col < cols; ++col) { - EIGEN_ASM_COMMENT("begin pack_simple inner copy"); - // Unrolled manually 4 times. 
- for (size_t i=0; i < num_packets/4; ++i) { - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - } - for (size_t i=0; i < num_packets%4; ++i) { - internal::pstore(dst, internal::pload(src)); - dst += psize; src += psize; - } - dst += lddst - num_packets*psize; - src += ldsrc - num_packets*psize; - EIGEN_ASM_COMMENT("end pack_simple inner copy"); - } - } else { - // Naive memcpy calls - for (Index col = 0; col < cols; ++col) { - memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar)); - } - } -} - -template - struct libxsmm_wrapper { - libxsmm_wrapper() {} - libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {} - void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {} - void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {} - }; - - template<> - struct libxsmm_wrapper: public libxsmm_mmfunction { - libxsmm_wrapper(): libxsmm_mmfunction() {} - libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) : - libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {} - }; - - template<> - struct libxsmm_wrapper: public libxsmm_mmfunction { - libxsmm_wrapper(): libxsmm_mmfunction() {} - libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) : - libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {} - }; -#endif - template struct traits > @@ -222,7 +158,7 @@ struct TensorContractionEvaluatorBase m_device(device), m_result(NULL) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == - static_cast(TensorEvaluator::Layout)), + static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -381,8 +317,6 @@ struct TensorContractionEvaluatorBase } } - EnableXSMMIfPossible(eval_op_indices); - // If the layout is RowMajor, we need to reverse the m_dimensions if (static_cast(Layout) == static_cast(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) { @@ -393,7 +327,7 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar * data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { @@ -488,13 +422,6 @@ struct TensorContractionEvaluatorBase template EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { - #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - if (m_can_use_xsmm) { - evalGemmXSMM(buffer); - return; - } - #endif - // columns in left side, rows in right side const Index k = this->m_k_size; @@ -611,214 +538,7 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } -protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array, ContractDims>& eval_op_indices) { - m_can_use_xsmm = false; - -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - if (!std::is_same::value || 
- !std::is_same::value || - !(std::is_same::value || - std::is_same::value) || - m_leftImpl.data() == NULL || - m_rightImpl.data() == NULL) { - return; - } - - // Check if we can use faster matmul algorithms. For contraction to be - // equivalent to matmul, we need both lhs and rhs contracting dims sequences - // to be either a prefix or suffix of all dims. Also, the order of both - // must be the same, so we don't have to do reordering. - // For example: - // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)] - // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)] - // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)] - // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)] - // Depending if contraction dims are prefix or suffix of all dims we need to - // pre-transpose matrices in matmul algorithm: - // lhs: prefix -> transpose, suffix -> no transpose - // rhs: prefix -> no transpose, suffix -> transpose - // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular, - // non-transposed matmul. - if (ContractDims == 0) { - // This case is totally uninteresting, filter it out to avoid problems - // with iterations in further tests. - return; - } - - // Check if RHS dims list is increasing. LHS already is, so if not, the - // order is different and we cannot do matmul. - for (int i = 1; i < ContractDims; i++) { - if (eval_op_indices[i].second < eval_op_indices[i-1].second) { - return; - } - } - - // Check if no holes. - int diff; - for (int i = 1; i < ContractDims; i++) { - // LHS contract dims are sorted to form an increasing seq. - diff = eval_op_indices[i].first - eval_op_indices[i-1].first; - if (diff != 1) { - return; - } - // Now we may already assume RHS contract dims seq is increasing too. - diff = eval_op_indices[i].second - eval_op_indices[i-1].second; - if (diff != 1) { - return; - } - } - - // Check if suffix or prefix. - if (eval_op_indices[0].first != 0 && - eval_op_indices[ContractDims-1].first != LDims-1) { - return; - } - if (eval_op_indices[0].second != 0 && - eval_op_indices[ContractDims-1].second != RDims-1) { - return; - } - - m_can_use_xsmm = true; -#else - EIGEN_UNUSED_VARIABLE(eval_op_indices); -#endif - } - -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - const bool transposeA = !m_lhs_inner_dim_contiguous; - const bool transposeB = !m_rhs_inner_dim_contiguous; - - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - - internal::TensorXsmmContractionBlocking blocking( - k, m, n, 1, transposeA, transposeB); - - // Outer blocks sizes - const Index mc_outer = blocking.outer_m(); - const Index nc_outer = blocking.outer_n(); - const Index kc_outer = blocking.outer_k(); - // Inner blocks sizes - const Index mc = blocking.mc(); - const Index nc = blocking.nc(); - const Index kc = blocking.kc(); - // Decisions whether we should copy parts of matrices - const bool copyA = blocking.copyA(); - const bool copyB = blocking.copyB(); - - const LhsScalar* leftData = m_leftImpl.data(); - const RhsScalar* rightData = m_rightImpl.data(); - - const libxsmm_blasint stride_A = static_cast(transposeA ? k : m); - const libxsmm_blasint stride_B = static_cast(transposeB ? 
n : k); - const libxsmm_blasint stride_C = static_cast(m); - - const libxsmm_blasint stride_blockA = static_cast(mc); - // Use bigger stride to avoid hitting same cache line too often. - // This consistently gives +~0.5 Gflops. - const libxsmm_blasint stride_panelB = static_cast( - kc % 32 == 0 ? kc + 16 : kc - ); - - // Kernel for the general case (not edges) - internal::libxsmm_wrapper kernel; - - LhsScalar* blockA = NULL; - RhsScalar* panelB = NULL; - - if (copyA) { - blockA = static_cast(this->m_device.allocate(mc * kc * sizeof(LhsScalar))); - } - if (copyB) { - panelB = static_cast(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar))); - } - - const Index kernel_stride_A = copyA ? stride_blockA : stride_A; - const Index kernel_stride_B = copyB ? stride_panelB : stride_B; - kernel = internal::libxsmm_wrapper(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch()); - - // Outer blocking - for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) { - for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) { - for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) { - using numext::mini; - - Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer; - - // Inner blocking - for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) { - const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki; - const float beta = ki == 0 ? 0 : 1; - - if (copyB) { - if (transposeB) { - libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB); - } else { - internal::pack_simple(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B); - } - } - - for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) { - const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi; - - const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki : - leftData + ki*stride_A + mi; - - if (copyA) { - if (transposeA) { - libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA); - } else { - internal::pack_simple(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A); - } - } - const LhsScalar* actual_a = copyA ? blockA : a; - - for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) { - const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni; - - const RhsScalar* b = rightData + ni*stride_B + ki; - Scalar* c = buffer + ni*stride_C + mi; - const Scalar* cp = c + nc*stride_C; - - const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b; - const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B; - - if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) { - // Most used, cached kernel. - kernel(actual_a, actual_b, c, actual_a, bp, cp); - } else { - // Edges - use libxsmm kernel cache. 
- internal::libxsmm_wrapper(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp); - } - } - } - } - } - } - } - - if (copyA) { - this->m_device.deallocate(blockA); - } - if (copyB) { - this->m_device.deallocate(panelB); - } - } -#endif - + protected: // Prevent assignment TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); Dimensions m_dimensions; @@ -844,7 +564,6 @@ protected: TensorEvaluator m_rightImpl; const Device& m_device; Scalar* m_result; - bool m_can_use_xsmm; }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index d34f9ca..5cf7b4f 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -50,140 +50,6 @@ class TensorContractionBlocking { }; - -#if defined(EIGEN_USE_LIBXSMM) -template -class TensorXsmmContractionBlocking { - public: - TensorXsmmContractionBlocking(Index k, Index m, Index n, - size_t max_num_threads = 1, bool transposeA = false, - bool transposeB = false): - k_(k), m_(m), n_(n), transposeA_(transposeA), - transposeB_(transposeB), num_threads_(max_num_threads) { -#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES - if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { - mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M; - kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K; - nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N; - outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M; - outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K; - outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N; - copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A; - copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B; - outer_m_ = outer_m_ != 0 ? outer_m_ : m; - outer_k_ = outer_k_ != 0 ? outer_k_ : k; - outer_n_ = outer_n_ != 0 ? outer_n_ : n; - } -#else - // Defaults, possibly overriden per-platform. - copyA_ = true; - copyB_ = false; - - // If the matrix is small enough, don't do blocking, just call single xsmm - // kernel. - if (static_cast(m)*k*n <= LIBXSMM_THRESHOLD) { - mc_ = m; kc_ = k; nc_ = n; - outer_m_ = m; outer_k_ = k; outer_n_ = n; - copyA_ = false; copyB_ = false; - } else { - int arch = libxsmm_cpuid_x86(); - - if (arch == LIBXSMM_X86_AVX512_CORE) { - // skylake - mc_ = 64; kc_ = 64; nc_ = 24; - outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22; - // Hack to use this kernel architecture as the other one has performance - // issues (no hardware prefetching). - // TODO(nishantpatil): This should be removed if the issues are fixed, - // or this one becomes the default. - setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1); - } else if (arch == LIBXSMM_X86_AVX2) { - // haswell - mc_ = 32; kc_ = 192; nc_ = 33; - outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16; - } else if (arch == LIBXSMM_X86_AVX) { - // ivybridge - mc_ = 32; kc_ = 192; nc_ = 48; - outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11; - } else { - // generic kernel size, usually performing well - mc_ = 32; kc_ = 128; nc_ = 32; - outer_m_ = 512; outer_k_ = 512; outer_n_ = 512; - } - - // Only copy if it makes the stride smaller. 
- copyA_ = copyA_ && (m > mc_); - copyB_ = copyB_ && (k > kc_); - } - - // We need to copy anyway if transposing - copyA_ = copyA_ || transposeA; - copyB_ = copyB_ || transposeB; - - // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h - prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C; - -#endif - - mc_ = mc_ > m ? m : mc_; - nc_ = nc_ > n ? n : nc_; - kc_ = kc_ > k ? k : kc_; - - size_t compute_parallelism = (m / mc_) * (n / nc_); - size_t pack_parallelism = 0; - if (copyA_) { - pack_parallelism += (m / mc_) * (k / kc_); - } - if (copyB_) { - pack_parallelism += (n / nc_) * (k / kc_); - } - size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism); - - num_threads_ = numext::mini(num_threads_, - parallelism / MIN_JOBS_PER_THREAD); - num_threads_ = numext::maxi(num_threads_, 1); - - // For optimal performance outer block sizes should be multiplies of kernel - // sizes, or bigger than matrix size (=no outer blocking). - eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m); - eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k); - eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n); - } - - EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } - EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } - EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } - EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; } - EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; } - EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; } - EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; } - EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; } - EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; } - EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; } - EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; } - EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); } - EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); } - EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); } - EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const { - return prefetch_; - } - - private: - Index k_, m_, n_; - Index kc_, mc_, nc_; - Index outer_k_, outer_m_, outer_n_; - bool copyA_, copyB_, transposeA_, transposeB_; - size_t num_threads_; - - // Threshold for m*k*n to skip blocking and just call libxsmm - const double LIBXSMM_THRESHOLD = 80*80*80; - // For computing optimal number of threads - so that each thread gets at least - // that many jobs. 
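Aside (not part of the patch; a worked illustration of the removed heuristic above, using made-up sizes): TensorXsmmContractionBlocking capped the thread count so that each thread received at least MIN_JOBS_PER_THREAD blocks of work. Sketched as standalone C++:

    #include <algorithm>
    #include <cstddef>

    int main() {
      // Hypothetical problem and block sizes, chosen only to make the arithmetic concrete.
      const std::size_t m = 1024, n = 1024, k = 1024;
      const std::size_t mc = 32, nc = 32, kc = 128;
      const bool copyA = true, copyB = false;
      std::size_t num_threads = 16;  // requested maximum

      std::size_t compute_parallelism = (m / mc) * (n / nc);       // 32 * 32 = 1024 compute blocks
      std::size_t pack_parallelism = 0;
      if (copyA) pack_parallelism += (m / mc) * (k / kc);          // 32 * 8  = 256 packing jobs
      if (copyB) pack_parallelism += (n / nc) * (k / kc);
      const std::size_t parallelism = std::max(compute_parallelism, pack_parallelism);

      const double MIN_JOBS_PER_THREAD = 3;                        // value used in the removed code
      num_threads = std::min<std::size_t>(num_threads,
          static_cast<std::size_t>(parallelism / MIN_JOBS_PER_THREAD));
      num_threads = std::max<std::size_t>(num_threads, 1);         // 1024/3 = 341 > 16, so 16 threads
      return static_cast<int>(num_threads);
    }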
- const double MIN_JOBS_PER_THREAD = 3; - libxsmm_gemm_prefetch_type prefetch_; -}; -#endif // EIGEN_USE_LIBXSMM - } // end namespace internal } // end namespace Eigen diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index c04b784..d65dbb4 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -529,6 +529,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh float2 rhs_shmem2[][8], const Index m_size, const Index n_size, const Index k_size, const Index base_m, const Index base_n) { + typedef float Scalar; // prefetch registers float4 lhs_pf0, rhs_pf0; @@ -539,27 +540,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - reg =lhs.template loadPacket(row, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - reg =lhs.template loadPacket(row, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ Index lhs_vert = base_m+threadIdx.x*4; @@ -577,7 +578,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -592,7 +593,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh } else { if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); @@ -765,6 +766,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, float2 rhs_shmem2[][8], const Index m_size, const Index n_size, const Index k_size, const Index base_m, const Index base_n) { + typedef float Scalar; // prefetch registers float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; @@ -788,37 +790,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_LHS_BOUNDARY) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + 
lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); } } else { // just CHECK_LHS_BOUNDARY if (lhs_vert + 3 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); } } else if (lhs_vert + 2 < m_size) { if ((threadIdx.y/4+k+24) < k_size) { @@ -907,8 +909,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (!CHECK_RHS_BOUNDARY) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); } else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -930,8 +932,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, if (rhs_horiz1 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); } 
else if (rhs_vert + 2 < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -952,7 +954,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, } else if (rhs_horiz0 < n_size) { if ((rhs_vert + 3) < k_size) { // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); } else if ((rhs_vert + 2) < k_size) { // just CHECK_RHS_BOUNDARY rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); @@ -1135,6 +1137,9 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, typedef float2 LHS_MEM[64][32]; typedef float2 RHS_MEM[128][8]; + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + const Index m_block_idx = blockIdx.x; const Index n_block_idx = blockIdx.y; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index ab320a5..9b2cb3f 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -22,14 +22,8 @@ enum { /* * Implementation of the Eigen blas_data_mapper class for tensors. */ -/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which -/// is scalar * for CoeffLoader. -template class MakePointer_ = MakePointer> struct CoeffLoader; -template class MakePointer_ = MakePointer> class BaseTensorContractionMapper; - -template class MakePointer_> struct CoeffLoader { + +template struct CoeffLoader { enum { DirectOffsets = false }; @@ -53,7 +47,7 @@ template class MakePointer const Tensor m_tensor; }; -template class MakePointer_> struct CoeffLoader { +template struct CoeffLoader { enum { DirectOffsets = true }; @@ -73,14 +67,13 @@ template class MakePointer_> struct CoeffLoad } private: typedef typename Tensor::Scalar Scalar; - - typename MakePointer_::Type m_data; + const Scalar* m_data; }; template class MakePointer_ = MakePointer> + int packet_size, bool inner_dim_contiguous, int Alignment> class SimpleTensorContractionMapper { public: EIGEN_DEVICE_FUNC @@ -96,7 +89,7 @@ class SimpleTensorContractionMapper { m_k_strides(k_strides) { } enum { - DirectOffsets = CoeffLoader::DirectOffsets + DirectOffsets = CoeffLoader::DirectOffsets }; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { @@ -213,22 +206,23 @@ class SimpleTensorContractionMapper { } protected: - CoeffLoader m_tensor; + CoeffLoader m_tensor; const nocontract_t m_nocontract_strides; const nocontract_t m_ij_strides; const contract_t m_contract_strides; const contract_t m_k_strides; }; + template class MakePointer_> -class BaseTensorContractionMapper : public SimpleTensorContractionMapper + bool inner_dim_reordered, int Alignment> +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -241,9 +235,9 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper::half HalfPacket; - template + template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { // whole method makes column major assumption // don't need to add offsets for now (because operator handles 
that) @@ -281,13 +275,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapperm_tensor.coeff(last); - return pload(data); - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - return this->load(i,j); + return pload(data); } template @@ -313,11 +301,11 @@ template class MakePointer_> -class BaseTensorContractionMapper : public SimpleTensorContractionMapper + bool inner_dim_reordered, int Alignment> +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -334,12 +322,6 @@ class BaseTensorContractionMapperm_tensor.coeff(this->computeIndex(i, j)); return pload(data); } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { - EIGEN_ALIGN_MAX Scalar data[1]; - data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload(data); - } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { return loadPacket(i, j); @@ -351,14 +333,14 @@ template class MakePointer_=MakePointer> + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionSubMapper { public: typedef typename Tensor::PacketReturnType Packet; typedef typename unpacket_traits::half HalfPacket; - typedef BaseTensorContractionMapper ParentMapper; - typedef TensorContractionSubMapper Self; + typedef BaseTensorContractionMapper ParentMapper; + typedef TensorContractionSubMapper Self; typedef Self LinearMapper; enum { @@ -403,14 +385,6 @@ class TensorContractionSubMapper { return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { - if (UseDirectOffsets) { - return m_base_mapper.template load(i, j); - } - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { if (UseDirectOffsets) { return m_base_mapper.template loadHalfPacket(i, 0); @@ -418,7 +392,7 @@ class TensorContractionSubMapper { return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet& p) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { if (UseDirectOffsets) { m_base_mapper.storePacket(i, 0, p); } @@ -458,14 +432,14 @@ template class MakePointer_=MakePointer> + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionInputMapper - : public BaseTensorContractionMapper { + : public BaseTensorContractionMapper { public: typedef Scalar_ Scalar; - typedef BaseTensorContractionMapper Base; - typedef TensorContractionSubMapper SubMapper; + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; typedef SubMapper VectorMapper; EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h deleted file mode 100644 index e87de0c..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ /dev/null @@ -1,400 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for 
linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorSyclConvertToDeviceExpression.h - * - * \brief: - * TensorContractionsycl - * -*****************************************************************/ - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H -namespace Eigen { - -template struct LaunchSyclKernels; -template -struct TensorEvaluator, const Eigen::SyclDevice> : - public TensorContractionEvaluatorBase, const Eigen::SyclDevice> > { - - typedef const Eigen::SyclDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - - // We need to redefine this method to make nvcc happy - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; - } - } - const Eigen::SyclDevice& device() const {return this->m_device;} - void evalTo(Scalar* buffer) const { - // Here is the result - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - 
evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - } - - template - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - // rows in left side - const Index m = this->m_i_size; - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - LaunchSyclKernels::Run(*this, buffer, m, n, k, - this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides, - this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides); - } - // required by sycl to construct the expr on the device. Returns original left_impl - const TensorEvaluator& left_impl() const { - return choose(Cond(Layout) == static_cast(ColMajor)>(), this->m_leftImpl, this->m_rightImpl); - } - // required by sycl to construct the expr on the device. Returns original right_impl - const TensorEvaluator& right_impl() const { - return choose(Cond(Layout) == static_cast(ColMajor)>(), this->m_rightImpl, this->m_leftImpl); - } -}; - -template struct KernelConstructor{ - typedef typename Eigen::internal::traits::_LhsNested LHSHostExpr; - typedef typename Eigen::internal::traits::_RhsNested RHSHostExpr; - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type LHSPlaceHolderExpr; - typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression::Type RHSPlaceHolderExpr; - LHSFunctorExpr lhs_functors; - RHSFunctorExpr rhs_functors; - LhsLocalAcc localLhs; - RhsLocalAcc localRhs; - OutAccessor out_res; - Index roundUpK, M, N, K; - ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides; - LeftNocontractT m_i_strides, m_left_nocontract_strides; - RightNocontractT m_j_strides, m_right_nocontract_strides; - LHSTupleType left_tuple_of_accessors; - RHSTupleType right_tuple_of_accessors; - Device dev; - - - KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_, - Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_, - ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_, - LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_) - :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_), - m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_), - m_right_contracting_strides(m_right_contracting_strides_), - m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_), - m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_), - left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){} - - 
void operator()(cl::sycl::nd_item<1> itemID) { - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type LHSDevExpr; - typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression::Type RHSDevExpr; - auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression(lhs_functors, left_tuple_of_accessors); - auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression(rhs_functors, right_tuple_of_accessors); - typedef decltype(lhs_dev_expr.expr) LeftArgType; - typedef decltype(rhs_dev_expr.expr) RightArgType; - typedef typename internal::conditional(Eigen::internal::traits::Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional(Eigen::internal::traits::Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - // initialize data mappers must happen inside the kernel for device eval - LhsMapper lhs(LeftEvaluator(choose(Cond(Eigen::internal::traits::Layout) == static_cast(ColMajor)>(), - lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides); - RhsMapper rhs(RightEvaluator(choose(Cond(Eigen::internal::traits::Layout) == static_cast(ColMajor)>(), - rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides); - auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res); - // Matmul Kernel - // Thread identifiers - const Index mLocalThreadId = itemID.get_local(0); // Local ID row - const Index nLocalThreadId = itemID.get_local(1); // Local ID col - const Index mGroupId = itemID.get_group(0); // Work-group ID row - const Index nGroupId = itemID.get_group(1); // Work-group ID localCol - const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID - // Allocate register space - float privateLhs; - float privateRhs[WorkLoadPerThreadN]; - float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN]; - // Initialise the privateResumulation registers - for (Index wLPTM=0; wLPTM(0); - } - // Tile Rhs - for (Index lPTR=0; lPTR(0); - - } - // Loop over all tiles - const Index numTiles = roundUpK/TileSizeDimK; - Index firstHalf=0; - do { - // Synchronise - itemID.barrier(cl::sycl::access::fence_space::local_space); - // Load the next tile of Lhs and Rhs into local memory - Index nextHalf = firstHalf + 1; - if (nextHalf < numTiles) { - // Tile A - for (Index lPTL=0; lPTL(0); - } - // Tile B - for (Index lPTR=0; lPTR(0); - } - } - // Loop over the values of a single tile - for (Index k=0; k struct LaunchSyclKernels { - -static const Index TileSizeDimM = 32ul; // Tile size for dimension M -static const Index TileSizeDimN = 32ul; // Tile size for dimension N -static const Index TileSizeDimK = 16ul; // Tile size for dimension K -static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M -static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N -static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here) -static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // 
Local thread size for the second dimension (N here) -static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression -static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression - -// RoundUp function to make sure that the global threadId is divisable by local threadId -static Index RoundUp(Index x, Index y) { - return ((((x) + (y) - 1) / (y))*(y)); -} - -template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT> - static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K, - ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides, - LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){ - - typedef typename Self::XprType HostExpr; - typedef typename Eigen::internal::traits::_LhsNested LHSHostExpr; - typedef typename Eigen::internal::traits::_RhsNested RHSHostExpr; - typedef TensorEvaluator OrigLHSExpr; - typedef TensorEvaluator OrigRHSExpr; - typedef Eigen::TensorSycl::internal::FunctorExtractor LHSFunctorExpr; - typedef Eigen::TensorSycl::internal::FunctorExtractor RHSFunctorExpr; - // extract lhs functor list - LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); - // extract rhs functor list - RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl()); - - Index roundUpK = RoundUp(K, TileSizeDimK); - Index roundUpM = RoundUp(M, TileSizeDimM); - Index roundUpN = RoundUp(N, TileSizeDimN); - - self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) { - /// work-around for gcc bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.left_impl())) LHSTupleType; - /// work-around for gcc bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.right_impl())) RHSTupleType; - // create lhs tuple of accessors - LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.left_impl()); - // create rhs tuple of accessors - RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, self.right_impl()); - - // Local memory for elements of Lhs - typedef cl::sycl::accessor LhsLocalAcc; - LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh); - // Local memory for elements of Rhs - typedef cl::sycl::accessor RhsLocalAcc; - RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh); - - typedef cl::sycl::accessor OutAccessor; - //OutScalar memory - OutAccessor out_res= self.device(). 
template get_sycl_accessor(cgh, buffer); - - // sycl parallel for - cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN), - cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)), - KernelConstructor(lhs_functors, rhs_functors, - localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides, - m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice())); - }); - self.device().asynchronousExec(); - } -}; - -} // end namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index d30cc96..ee16cde 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -116,28 +116,6 @@ struct TensorEvaluator void evalProduct(Scalar* buffer) const { - const Index m = this->m_i_size; - const Index n = this->m_j_size; - const Index k = this->m_k_size; - if (m == 0 || n == 0 || k == 0) return; - -#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM) - if (this->m_can_use_xsmm) { - bool transposeA = !this->m_lhs_inner_dim_contiguous; - bool transposeB = !this->m_rhs_inner_dim_contiguous; - internal::TensorXsmmContractionBlocking - blocking(k, m, n, this->m_device.numThreads(), transposeA, - transposeB); - - if (blocking.num_threads() == 1) { - this->evalGemmXSMM(buffer); - } else { - ContextXsmm(this, buffer, m, n, k, blocking).run(); - } - return; - } -#endif - typedef typename internal::remove_const::type LhsScalar; @@ -169,7 +147,10 @@ struct TensorEvaluator GebpKernel; - + const Index m = this->m_i_size; + const Index n = this->m_j_size; + const Index k = this->m_k_size; + if (m == 0 || n == 0 || k == 0) return; // Compute a set of algorithm parameters: // - kernel block sizes (bm, bn, bk) @@ -1063,187 +1044,6 @@ struct TensorEvaluator - class ContextXsmm { - public: - ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k, - const internal::TensorXsmmContractionBlocking& blocking): - device(self->m_device), - m(m), k(k), n(n), - stride_a(blocking.transposeA() ? k : m), - stride_b(blocking.transposeB() ? 
n : k), - stride_c(m), - bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()), - blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()), - blocks_n(blocking.blocks_n()), - copyA(blocking.copyA()), copyB(blocking.copyB()), - transposeA(blocking.transposeA()), transposeB(blocking.transposeB()), - num_threads(blocking.num_threads()), - buffer(buffer), - leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()), - workers_done(blocking.num_threads()), - - packingA_jobs(0), packingB_jobs(0), compute_jobs(0), - packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()) {} - - void worker() { - // Pack - - if (copyA) { - while (true) { - uint32_t mk = packingA_jobs++; - Index mi = mk / blocks_k; - Index ki = mk % blocks_k; - if (mi >= blocks_m) break; - - LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki); - if (transposeA) { - const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki); - libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki), - actual_bm(mi), stride_a, bm); - } else { - const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi); - internal::pack_simple(blockA, current_a, - actual_bk(ki), actual_bm(mi), bm, stride_a); - } - packingA_done.at(mi)++; - } - } - - if (copyB) { - while (true) { - uint32_t nk = packingB_jobs++; - Index ni = nk / blocks_k; - Index ki = nk % blocks_k; - if (ni >= blocks_n) break; - - RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki); - if (transposeB) { - const RhsScalar * current_b = rightData + (ki*bk)*stride_b + - (ni*bn); - libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni), - actual_bk(ki), stride_b, bk); - } else { - const RhsScalar * current_b = rightData + (ni*bn)*stride_b + - (ki*bk); - internal::pack_simple(blockB, current_b, - actual_bn(ni), actual_bk(ki), bk, stride_b); - } - packingB_done.at(ni)++; - } - } - - // Compute - - while (true) { - uint32_t mn = compute_jobs++; - Index mi = mn / blocks_n; - Index ni = mn % blocks_n; - if (mi >= blocks_m) break; - - // Wait for mi, ni packings to be done. This is more fine-grained than - // waiting for all workers to finish packing. - while ((copyA && (packingA_done.at(mi) < blocks_k)) || - (copyB && (packingB_done.at(ni) < blocks_k))) - {} - - for (Index ki=0; ki < blocks_k; ++ki) { - const LhsScalar * current_a = copyA ? - blocksA + (bk*bm) * (mi*blocks_k+ki) : - leftData + (bk*ki)*stride_a + (bm*mi); - const RhsScalar * current_b = copyB ? - blocksB + (bk*bn) * (ni*blocks_k+ki) : - rightData + (ni*bn)*stride_b + (bk*ki); - - Index current_stride_a = copyA ? bm : stride_a; - Index current_stride_b = copyB ? bk : stride_b; - - // Memory may not be zeroed, overwrite instead of adding in first - // iteration. - float beta = ki == 0 ? 0 : 1; - - Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c; - internal::libxsmm_wrapper( - 0, actual_bm(mi), actual_bn(ni), actual_bk(ki), - current_stride_a, current_stride_b, stride_c, 1, beta, 0) - (current_a, current_b, current_c); - } - } - - workers_done.Notify(); - } - - void run() { - // Parallelization strategy. - // - // First pack A into blocks (sharding by m, k) and B (sharding by n,k), - // then shard by m, n. - // - // Do not use advanced ThreadPool queuing, just run a single long-standing - // function in each thread. 
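
Aside: the removed ContextXsmm::worker() above distributes its packing and compute jobs by having every thread pull flattened (i, j) indices from shared atomic counters. A minimal standalone sketch of that pattern, in plain C++ with illustrative names (not Eigen or libxsmm API):

// Illustrative sketch only: a shared atomic counter hands out flattened
// (mi, ni) work items over a 2-D block grid, as in the removed worker().
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  const int blocks_m = 4, blocks_n = 3;        // grid of work items
  std::atomic<unsigned> next_job(0);           // shared job counter
  std::vector<int> done(blocks_m * blocks_n, 0);

  auto worker = [&]() {
    while (true) {
      unsigned mn = next_job++;                // claim the next flattened index
      int mi = static_cast<int>(mn) / blocks_n;
      int ni = static_cast<int>(mn) % blocks_n;
      if (mi >= blocks_m) break;               // grid exhausted
      done[mi * blocks_n + ni] = 1;            // stand-in for the real block work
    }
  };

  std::vector<std::thread> pool;
  for (int t = 0; t < 4; ++t) pool.emplace_back(worker);
  for (auto& t : pool) t.join();

  int completed = 0;
  for (int d : done) completed += d;
  std::printf("completed %d of %d jobs\n", completed, blocks_m * blocks_n);
  return 0;
}

Post-incrementing the counter is the only synchronization needed, because each flattened index is claimed by exactly one thread.
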
- if (copyA) { - blocksA = static_cast(device.allocate( - (blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar))); - } - if (copyB) { - blocksB = static_cast(device.allocate( - (blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar))); - } - - for (Index i = 0; i < num_threads; ++i) { - device.enqueueNoNotification([=]() { worker(); }); - } - - workers_done.Wait(); - - if (copyA) { - device.deallocate(blocksA); - } - if (copyB) { - device.deallocate(blocksB); - } - } - - private: - // real block size for block index in [0, ..., blocks - 1]. - Index actual_bm(Index mi) const { - return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m; - } - Index actual_bk(Index ki) const { - return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k; - } - Index actual_bn(Index ni) const { - return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n; - } - - const Device& device; - Index m, k, n; - Index stride_a, stride_b, stride_c; - Index bm, bk, bn; // Block sizes. - Index blocks_m, blocks_k, blocks_n; // Number of blocks in each dimension. - bool copyA, copyB, transposeA, transposeB; - Index num_threads; - Scalar *buffer; - const LhsScalar *leftData; - const RhsScalar *rightData; - - LhsScalar *blocksA; - RhsScalar *blocksB; - // barrier for joining all threads after all done. - Barrier workers_done; - // "queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q) - std::atomic packingA_jobs; - std::atomic packingB_jobs; - std::atomic compute_jobs; - // already packed blocks for each mi-panel in A and ni-panel in B. - std::vector> packingA_done; - std::vector> packingB_done; - }; -#endif - }; } // end namespace Eigen diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index b29968b..860a694 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -246,9 +246,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the sycl accessor - const TensorEvaluator& impl() const { return m_impl; } - protected: template struct PacketConv { diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 378f5cc..abdf742 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -100,7 +100,7 @@ class IndexMapper { } } else { for (int i = NumDims - 1; i >= 0; --i) { - if (static_cast(i + 1) < offset) { + if (i + 1 < offset) { m_cudaInputStrides[i] = m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; m_cudaOutputStrides[i] = diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h deleted file mode 100644 index 4247c1c..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ /dev/null @@ -1,476 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// Copyright (C) 2016 Benoit Steiner - -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H - -namespace Eigen { - -/** \class TensorConvolution - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor convolution class. - * - * - */ -template -struct EigenConvolutionKernel1D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; -internal::IndexMapper::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize, range_x, range_y; -Buffer_accessor buffer_acc; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel1D(internal::IndexMapper::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} - - void operator()(cl::sycl::nd_item<2> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); //the required row to be calculated for the for each plane in shered memory - const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input; - const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1)); - /// fill the shared memory - for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { - const size_t local_index = i + plane_kernel_offset ; - const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start); - if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){ - local_acc[local_index] = device_evaluator.coeff(tensor_index); - } - else local_acc[local_index]=0.0f; - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - - // calculate the convolution - const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x - if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){ - CoeffReturnType result = static_cast(0); - const size_t index = plane_kernel_offset+ itemID.get_local(0); - for (size_t k = 0; k < kernelSize; ++k) { - result += (local_acc[k + index] * kernel_ptr[k]); - } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1)) - +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start); - buffer_ptr[tensor_index] = result; - } - } -}; - - -template -struct EigenConvolutionKernel2D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; -internal::IndexMapper::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z; 
-Buffer_accessor buffer_acc; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel2D(internal::IndexMapper::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} - - void operator()(cl::sycl::nd_item<3> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory - const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2)); - const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input; - - /// fill the shared memory - const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; - for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) { - const size_t local_input_offset = num_x_input * (j + plane_kernel_offset); - for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) { - const size_t local_index = i + local_input_offset; - const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start ); - if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){ - local_acc[local_index] = device_evaluator.coeff(tensor_index); - } - else local_acc[local_index]=0.0f; - } - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - - // calculate the convolution - const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x - const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y - if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){ - CoeffReturnType result = static_cast(0); - for (size_t j = 0; j < kernelSize_y; j++) { - size_t kernel_offset =kernelSize_x * j; - const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0); - for (size_t i = 0; i < kernelSize_x; i++) { - result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]); - } - } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2)) - 
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start); - buffer_ptr[tensor_index] = result; - } - } -}; - - - -template -struct EigenConvolutionKernel3D{ -typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; -internal::IndexMapper::Layout> indexMapper; -Kernel_accessor kernel_filter; -const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP; -Buffer_accessor buffer_acc; -Local_accessor local_acc; -FunctorExpr functors; -TupleType tuple_of_accessors; -EigenConvolutionKernel3D(internal::IndexMapper::Layout> indexMapper_, - Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ , - const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_, - Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_) - :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), - kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_), - buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {} - - void operator()(cl::sycl::nd_item<3> itemID) { - typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; - auto device_expr =TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); - auto device_evaluator = Eigen::TensorEvaluator(device_expr.expr, Eigen::DefaultDevice()); - - auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc); - auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter); - const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory - const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); //the required row to be calculated for the for each plane in shered memory - const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0]; - const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1]; - const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2]; - for(size_t p=0; p(0); - for (size_t k = 0; k < kernelSize_z; k++) { - for (size_t j = 0; j < kernelSize_y; j++) { - for (size_t i = 0; i < kernelSize_x; i++) { - const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k); - const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2)))); - result += (local_acc[local_index] * kernel_ptr[kernel_index]); - } - } - } - const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p) - +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start ); - buffer_ptr[tensor_index] = result; - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - } - } -}; - - -template -struct TensorEvaluator, const Eigen::SyclDevice> -{ - typedef TensorConvolutionOp XprType; - - static const int NumDims = internal::array_size::Dimensions>::value; - static 
const int NumKernelDims = internal::array_size::value; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - typedef typename TensorEvaluator::Dimensions KernelDimensions; - typedef const Eigen::SyclDevice Device; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device) - : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - m_dimensions = m_inputImpl.dimensions(); - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - } - } - - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename InputArgType::Scalar Scalar; - static const int PacketSize = internal::unpacket_traits::size; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - preloadKernel(); - m_inputImpl.evalSubExprsIfNeeded(NULL); - if (data) { - executeEval(data); - return false; - } else { - m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); - executeEval(m_buf); - return true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_buf) { - m_device.deallocate(m_buf); - m_buf = NULL; - } - if (m_local_kernel) { - m_device.deallocate((void*)m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. 
it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); - typedef TensorEvalToOp EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); - m_kernel = local; - m_local_kernel = true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const { - typedef TensorEvaluator InputEvaluator; - typedef typename InputEvaluator::Dimensions InputDims; - - typedef Eigen::TensorSycl::internal::FunctorExtractor InputFunctorExpr; - // extract input functor list - InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl); - - - m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) { - - typedef cl::sycl::accessor InputLocalAcc; - /// work-around for gcc 4.8 auto bug - typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, m_inputImpl)) InputTupleType; - // create input tuple of accessors - InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors(cgh, m_inputImpl); - - typedef cl::sycl::accessor OutputAccessorType; - OutputAccessorType out_res= m_device. template get_sycl_accessor(cgh, data); - typedef cl::sycl::accessor KernelAccessorType; - KernelAccessorType kernel_acc= m_device. template get_sycl_accessor(cgh, m_kernel); - - switch (NumKernelDims) { - case 1: { - const size_t numX = dimensions()[m_indices[0]]; - const size_t numP = dimensions().TotalSize() / numX; - const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y; - m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y ); - const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y); - assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range - auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - const array indices{{m_indices[0]}}; - const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range), - EigenConvolutionKernel1D( - indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors)); - break; - } - - case 2: { - const size_t idxX =static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; - const size_t idxY =static_cast(Layout) == static_cast(ColMajor) ? 
1 : 0; - const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const size_t numX = dimensions()[m_indices[idxX]]; - const size_t numY = dimensions()[m_indices[idxY]]; - const size_t numP = dimensions().TotalSize() / (numX*numY); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; - m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); - const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z; - assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range - auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - const array indices {{m_indices[idxX], m_indices[idxY]}}; - const array kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}}; - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), - EigenConvolutionKernel2D( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors)); - break; - } - - case 3: { - const size_t idxX =static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; - const size_t idxY =static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; - const size_t idxZ =static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; - const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ]; - const size_t numX = dimensions()[m_indices[idxX]]; - const size_t numY = dimensions()[m_indices[idxY]]; - const size_t numZ = dimensions()[m_indices[idxZ]]; - const size_t numP = dimensions().TotalSize() / (numX*numY*numZ); - const array indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}}; - const array kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}}; - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z; - m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z ); - const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_y -1); - assert(static_cast(shared_mem) <= m_device.sharedMemPerBlock()); - auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range - auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range - InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh); - cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range), - EigenConvolutionKernel3D( - indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY, - numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors)); - break; - } - - default: { - EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); - } - } - }); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) 
const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return m_buf[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return internal::ploadt(m_buf+index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost - // model. - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. - const double convolve_compute_cost = - TensorOpCost::AddCost() + TensorOpCost::MulCost(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - private: - // No assignment (copies are needed by the kernels) - TensorEvaluator& operator = (const TensorEvaluator&); - TensorEvaluator m_inputImpl; - KernelArgType m_kernelArg; - TensorEvaluator m_kernelImpl; - Indices m_indices; - Dimensions m_dimensions; - Scalar* m_buf; - const Scalar* m_kernel; - bool m_local_kernel; - const Eigen::SyclDevice& m_device; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index be8d693..4f5767b 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -88,7 +88,7 @@ static void initializeDeviceProp() { #if __cplusplus >= 201103L std::atomic_thread_fence(std::memory_order_acquire); #endif - EIGEN_SLEEP(1000); + sleep(1); } } } @@ -217,10 +217,7 @@ struct GpuDevice { EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else - EIGEN_UNUSED_VARIABLE(dst); - EIGEN_UNUSED_VARIABLE(src); - EIGEN_UNUSED_VARIABLE(n); - eigen_assert(false && "The default device should be used instead to generate kernel code"); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index ccaaa6c..9d14139 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -45,7 +45,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#ifndef __CUDA_ARCH__ // Running on the host CPU return l1CacheSize(); #else @@ -55,7 +55,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#ifndef __CUDA_ARCH__ // Running single threaded on the host CPU return l3CacheSize(); #else diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index e209799..7c03989 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ 
b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -16,400 +16,107 @@ #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H namespace Eigen { - - #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast::pointer_t>((&(*buf_acc.get_pointer()))) - - template class MemCopyFunctor { - public: - MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {} - - void operator()(cl::sycl::nd_item<1> itemID) { - auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc); - auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc); - auto globalid = itemID.get_global_linear_id(); - if (globalid < m_rng) { - dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset]; - } - } - - private: - read_accessor m_src_acc; - write_accessor m_dst_acc; - size_t m_rng; - size_t m_i; - size_t m_offset; - }; - - struct memsetkernelFunctor{ - typedef cl::sycl::accessor AccType; - AccType m_acc; - const size_t m_rng, m_c; - memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){} - void operator()(cl::sycl::nd_item<1> itemID) { - auto globalid=itemID.get_global_linear_id(); - if (globalid< m_rng) m_acc[globalid] = m_c; - } - - }; - -EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){ - auto devices = cl::sycl::device::get_devices(); - std::vector::iterator it =devices.begin(); - while(it!=devices.end()) { - /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU ) - auto s= (*it).template get_info(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs - it=devices.erase(it); - } - else{ - ++it; - } - } - return devices; -} - -struct QueueInterface { - /// class members: - bool exception_caught_ = false; - - mutable std::mutex mutex_; - +struct SyclDevice { + /// class members + /// sycl queue + mutable cl::sycl::queue m_queue; /// std::map is the container used to make sure that we create only one buffer /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map> buffer_map; - /// sycl queue - mutable cl::sycl::queue m_queue; - /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename - /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it. 
- template explicit QueueInterface(const dev_Selector& s): + mutable std::map> buffer_map; + /// creating device by using selector + template SyclDevice(dev_Selector s) + : #ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { + m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) { for (const auto& e : l) { try { - if (e) { - exception_caught_ = true; - std::rethrow_exception(e); - } + std::rethrow_exception(e); } catch (cl::sycl::exception e) { - std::cerr << e.what() << std::endl; - } + std::cout << e.what() << std::endl; + } } })) #else -m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) { - for (const auto& e : l) { - if (e) { - exception_caught_ = true; - std::cerr << "Error detected Inside Sycl Device."<< std::endl; - - } - } -})) + m_queue(cl::sycl::queue(s)) #endif {} + // destructor + ~SyclDevice() { deallocate_all(); } - /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer. - /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key - /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we - /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer. - /// The device pointer would be deleted by calling deallocate function. - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - auto buf = cl::sycl::buffer(cl::sycl::range<1>(num_bytes)); - auto ptr =buf.get_access().get_pointer(); - buf.set_final_data(nullptr); - std::lock_guard lock(mutex_); - buffer_map.insert(std::pair>(static_cast(ptr),buf)); - return static_cast(ptr); - } - - /// This is used to deallocate the device pointer. p is used as a key inside - /// the map to find the device buffer and delete it. - EIGEN_STRONG_INLINE void deallocate(void *p) const { - std::lock_guard lock(mutex_); - auto it = buffer_map.find(static_cast(p)); + template void deallocate(T *p) const { + auto it = buffer_map.find(p); if (it != buffer_map.end()) { buffer_map.erase(it); + internal::aligned_free(p); } } - - EIGEN_STRONG_INLINE void deallocate_all() const { - std::lock_guard lock(mutex_); - buffer_map.clear(); - } - - EIGEN_STRONG_INLINE std::map>::iterator find_buffer(const void* ptr) const { - std::lock_guard lock(mutex_); - auto it1 = buffer_map.find(static_cast(ptr)); - if (it1 != buffer_map.end()){ - return it1; - } - else{ - for(std::map>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){ - auto size = it->second.get_size(); - if((it->first < (static_cast(ptr))) && ((static_cast(ptr)) < (it->first + size)) ) return it; - } - } - std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl; - abort(); - } - - // This function checks if the runtime recorded an error for the - // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { - if (!exception_caught_) { - m_queue.wait_and_throw(); + void deallocate_all() const { + std::map>::iterator it=buffer_map.begin(); + while (it!=buffer_map.end()) { + auto p=it->first; + buffer_map.erase(it); + internal::aligned_free(const_cast(p)); + it=buffer_map.begin(); } - return !exception_caught_; + buffer_map.clear(); } - // destructor - ~QueueInterface() { buffer_map.clear(); } -}; - -struct SyclDevice { - // class member. 
- QueueInterface* m_queue_stream; - /// QueueInterface is not owned. it is the caller's responsibility to destroy it. - explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){} - - /// Creation of sycl accessor for a buffer. This function first tries to find + /// creation of sycl accessor for a buffer. This function first tries to find /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - /// the function then adds an entry by creating a sycl buffer for that particular pointer. - template EIGEN_STRONG_INLINE cl::sycl::accessor - get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const { - return (get_sycl_buffer(ptr).template get_access(cgh)); + ///the function then adds an entry by creating a sycl buffer for that particular pointer. + template inline cl::sycl::accessor + get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const { + return (get_sycl_buffer(num_bytes, ptr)->template get_access(cgh)); } - /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer& get_sycl_buffer(const void * ptr) const { - return m_queue_stream->find_buffer(ptr)->second; + template inline std::pair>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const { + using Type = cl::sycl::buffer; + std::pair>::iterator,bool> ret = buffer_map.insert(std::pair>(ptr, std::shared_ptr(new Type(cl::sycl::range<1>(num_bytes)), + [](void *dataMem) { delete static_cast(dataMem); }))); + (static_cast(buffer_map.at(ptr).get()))->set_final_data(nullptr); + return ret; } - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const { - tileSize =static_cast(sycl_queue().get_device(). 
template get_info()); - auto s= sycl_queue().get_device().template get_info(); - std::transform(s.begin(), s.end(), s.begin(), ::tolower); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - tileSize=std::min(static_cast(256), static_cast(tileSize)); - } - rng = n; - if (rng==0) rng=static_cast(1); - GRange=rng; - if (tileSize>GRange) tileSize=GRange; - else if(GRange>tileSize){ - Index xMode = static_cast(GRange % tileSize); - if (xMode != 0) GRange += static_cast(tileSize - xMode); - } - } - - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); - } - tileSize0 = static_cast(max_workgroup_Size/tileSize1); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); - } + template inline cl::sycl::buffer* get_sycl_buffer(size_t num_bytes,const T * ptr) const { + return static_cast*>(add_sycl_buffer(ptr, num_bytes).first->second.get()); } - - - /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const { - Index max_workgroup_Size = static_cast(maxSyclThreadsPerBlock()); - if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size - max_workgroup_Size=std::min(static_cast(256), static_cast(max_workgroup_Size)); - } - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - tileSize2 =static_cast(std::pow(2, static_cast(pow_of_2/3))); - rng2=dim2; - if (rng2==0 ) rng1=static_cast(1); - GRange2=rng2; - if (tileSize2>GRange2) tileSize2=GRange2; - else if(GRange2>tileSize2){ - Index xMode = static_cast(GRange2 % tileSize2); - if (xMode != 0) GRange2 += static_cast(tileSize2 - xMode); - } - pow_of_2 = static_cast(std::log2(static_cast(max_workgroup_Size/tileSize2))); - tileSize1 =static_cast(std::pow(2, static_cast(pow_of_2/2))); - rng1=dim1; - if (rng1==0 ) rng1=static_cast(1); - GRange1=rng1; - if (tileSize1>GRange1) tileSize1=GRange1; - else if(GRange1>tileSize1){ - Index xMode = static_cast(GRange1 % tileSize1); - if (xMode != 0) GRange1 += static_cast(tileSize1 - xMode); - } - tileSize0 = static_cast(max_workgroup_Size/(tileSize1*tileSize2)); - rng0 = dim0; - if (rng0==0 ) rng0=static_cast(1); - GRange0=rng0; - if (tileSize0>GRange0) tileSize0=GRange0; - else if(GRange0>tileSize0){ - Index xMode = 
static_cast(GRange0 % tileSize0); - if (xMode != 0) GRange0 += static_cast(tileSize0 - xMode); - } - } - /// allocate device memory - EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { - return m_queue_stream->allocate(num_bytes); + /// allocating memory on the cpu + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const { + return internal::aligned_malloc(8); } - /// deallocate device memory - EIGEN_STRONG_INLINE void deallocate(void *p) const { - m_queue_stream->deallocate(p); - } // some runtime conditions that can be applied here - EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + bool isDeviceSuitable() const { return true; } - /// the memcpy function - template EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - auto it1 = m_queue_stream->find_buffer(static_cast(src)); - auto it2 = m_queue_stream->find_buffer(dst); - auto offset= (static_cast(static_cast(src))) - it1->first; - auto i= (static_cast(dst)) - it2->first; - offset/=sizeof(Index); - i/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc =it1->second.template get_access(cgh); - auto dst_acc =it2->second.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, i, offset)); - }); - synchronize(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + ::memcpy(dst, src, n); } - /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device - /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode - /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the - /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that - /// this buffer is accessed, the data will be copied to the device. - template EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const { - auto host_acc= get_sycl_buffer(dst). template get_access(); - ::memcpy(host_acc.get_pointer(), src, n); + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { + auto host_acc= (static_cast*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access(); + memcpy(host_acc.get_pointer(), src, n); } - /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl - /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the - /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination - /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data - /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back - /// to the cpu only once per function call. 
- template EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const { - auto it = m_queue_stream->find_buffer(src); - auto offset =static_cast(static_cast(src))- it->first; - offset/=sizeof(Index); - size_t rng, GRange, tileSize; - parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange); - // Assuming that the dst is the start of the destination pointer - auto dest_buf = cl::sycl::buffer >(static_cast(dst), cl::sycl::range<1>(n)); - sycl_queue().submit([&](cl::sycl::handler &cgh) { - auto src_acc= it->second.template get_access(cgh); - auto dst_acc =dest_buf.template get_access(cgh); - typedef decltype(src_acc) read_accessor; - typedef decltype(dst_acc) write_accessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); - }); - synchronize(); - } - /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} - /// Here is the implementation of memset function on sycl. - EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - size_t rng, GRange, tileSize; - parallel_for_setup(n, tileSize, rng, GRange); - sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast(static_cast(data))),rng, GRange, tileSize, c )); - synchronize(); - } - - struct memsetCghFunctor{ - cl::sycl::buffer& m_buf; - const size_t& rng , GRange, tileSize; - const int &c; - memsetCghFunctor(cl::sycl::buffer& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_) - :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){} - - void operator()(cl::sycl::handler &cgh) const { - auto buf_acc = m_buf.template get_access(cgh); - cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c)); + /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { + auto it = buffer_map.find(src); + if (it != buffer_map.end()) { + auto host_acc= (static_cast*>(it->second.get()))-> template get_access(); + memcpy(dst,host_acc.get_pointer(), n); + } else{ + eigen_assert("no device memory found. The memory might be destroyed before creation"); } - }; - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. - return firstLevelCacheSize(); - } - EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return sycl_queue().get_device(). template get_info(); - // return stream_->deviceProperties().multiProcessorCount; } - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return sycl_queue().get_device(). template get_info(); - // return stream_->deviceProperties().maxThreadsPerBlock; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const { + ::memset(buffer, c, n); } - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesnot have such concept - return 2;//sycl_queue().get_device(). 
template get_info(); - // return stream_->deviceProperties().maxThreadsPerMultiProcessor; - } - EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return sycl_queue().get_device(). template get_info(); - // return stream_->deviceProperties().sharedMemPerBlock; - } - /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } - - EIGEN_STRONG_INLINE void synchronize() const { - sycl_queue().wait_and_throw(); //pass - } - - EIGEN_STRONG_INLINE void asynchronousExec() const { - ///FIXEDME:: currently there is a race condition regarding the asynch scheduler. - //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled - sycl_queue().wait_and_throw(); //pass - - } - // This function checks if the runtime recorded an error for the - // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { - return m_queue_stream->ok(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return 1; } }; - - } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index 16180ca..069680a 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -12,6 +12,17 @@ namespace Eigen { +// Use the SimpleThreadPool by default. We'll switch to the new non blocking +// thread pool later. +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL +template using ThreadPoolTempl = NonBlockingThreadPoolTempl; +typedef NonBlockingThreadPool ThreadPool; +#else +template using ThreadPoolTempl = SimpleThreadPoolTempl; +typedef SimpleThreadPool ThreadPool; +#endif + + // Barrier is an object that allows one or more threads to wait until // Notify has been called a specified number of times. class Barrier { @@ -245,7 +256,7 @@ struct ThreadPoolDevice { // Split into halves and submit to the pool. 
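
Aside: the parallelFor hunk above recursively halves [first, last) and hands the pieces to the thread pool until a piece fits within one block. A rough sketch of that divide-and-conquer shape, with std::async standing in for the Eigen pool and all names illustrative (one common variant, as here, runs one half inline and submits the other):

// Illustrative sketch only: recursive range splitting with a block-size cutoff.
#include <cstdio>
#include <functional>
#include <future>

void parallel_for(long first, long last, long block_size,
                  const std::function<void(long, long)>& f) {
  if (last - first <= block_size) {
    f(first, last);                       // leaf: run the range directly
    return;
  }
  const long mid = first + (last - first) / 2;
  // Submit one half asynchronously, recurse on the other, then wait.
  auto right = std::async(std::launch::async, parallel_for, mid, last,
                          block_size, std::cref(f));
  parallel_for(first, mid, block_size, f);
  right.get();
}

int main() {
  long sums[8] = {0};
  parallel_for(0, 8000, 1000, [&](long lo, long hi) {
    for (long i = lo; i < hi; ++i) sums[lo / 1000] += 1;  // leaves are disjoint
  });
  long total = 0;
  for (long s : sums) total += s;
  std::printf("processed %ld elements\n", total);
  return 0;
}
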
Index mid = first + divup((last - first) / 2, block_size) * block_size; pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); - handleRange(first, mid); + pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); }; handleRange(0, n); barrier.Wait(); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 86405e6..b24cdeb 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -33,7 +33,7 @@ namespace Eigen { namespace internal { template struct dget { - static const std::ptrdiff_t value = get::value; + static const std::size_t value = get::value; }; @@ -90,11 +90,9 @@ struct fixed_size_tensor_index_extraction_helper // Fixed size #ifndef EIGEN_EMULATE_CXX11_META_H template -struct Sizes { +struct Sizes : internal::numeric_list { typedef internal::numeric_list Base; - const Base t = Base(); static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); - static const size_t count = Base::count; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { return Base::count; @@ -122,16 +120,16 @@ struct Sizes { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const { - return internal::fixed_size_tensor_index_extraction_helper::run(index, t); + return internal::fixed_size_tensor_index_extraction_helper::run(index, *this); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, t); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, t); + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 82dd1e6..0698713 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -41,9 +41,6 @@ struct traits > // Intermediate typedef to workaround MSVC issue. typedef MakePointer_ MakePointerT; typedef typename MakePointerT::Type Type; - typedef typename MakePointerT::RefType RefType; - - }; }; @@ -120,7 +117,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { return m_op; } - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d641581..834ce07 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -32,7 +32,6 @@ struct TensorEvaluator typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; - typedef Derived XprType; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? 
@@ -69,9 +68,7 @@ struct TensorEvaluator return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::traits::template MakePointer::RefType - coeffRef(Index index) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { eigen_assert(m_data); return m_data[index]; } @@ -97,9 +94,7 @@ struct TensorEvaluator } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::traits::template MakePointer::RefType - coeffRef(const array& coords) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { eigen_assert(m_data); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; @@ -157,8 +152,6 @@ struct TensorEvaluator typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; - typedef const Derived XprType; - // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index f060191..08eb559 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -253,7 +253,7 @@ struct TensorEvaluator, D // get data into line_buf const Index stride = m_strides[dim]; if (stride == 1) { - m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); + memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; for (int j = 0; j < line_len; ++j, offset += stride) { @@ -271,7 +271,7 @@ struct TensorEvaluator, D // write back if (FFTDir == FFT_FORWARD && stride == 1) { - m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); + memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index abe85c8..bbd5eb3 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -26,8 +26,8 @@ namespace Eigen { /// Therefore, by adding the default value, we managed to convert the type and it does not break any /// existing code as its default value is T*. namespace internal { -template -struct traits > +template class MakePointer_> +struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; @@ -42,26 +42,31 @@ struct traits > enum { Flags = 0 }; + template struct MakePointer { + // Intermediate typedef to workaround MSVC issue. 
+ typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; }; -template -struct eval, Eigen::Dense> +template class MakePointer_> +struct eval, Eigen::Dense> { - typedef const TensorForcedEvalOp& type; + typedef const TensorForcedEvalOp& type; }; -template -struct nested, 1, typename eval >::type> +template class MakePointer_> +struct nested, 1, typename eval >::type> { - typedef TensorForcedEvalOp type; + typedef TensorForcedEvalOp type; }; } // end namespace internal -template -class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> +template class MakePointer_> +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -83,10 +88,10 @@ class TensorForcedEvalOp : public TensorBase, ReadOn }; -template -struct TensorEvaluator, Device> +template class MakePointer_> +struct TensorEvaluator, Device> { - typedef TensorForcedEvalOp XprType; + typedef TensorForcedEvalOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; typedef typename XprType::Index Index; @@ -102,7 +107,7 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - /// op_ is used for sycl + /// op_ is used for sycl : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } @@ -143,17 +148,17 @@ struct TensorEvaluator, Device> return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; } + EIGEN_DEVICE_FUNC typename MakePointer::Type data() const { return m_buffer; } /// required by sycl in order to extract the sycl accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() { return m_impl; } + const TensorEvaluator& impl() { return m_impl; } /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;} + const Device& device() const{return m_device;} private: TensorEvaluator m_impl; const ArgType m_op; const Device& m_device; - CoeffReturnType* m_buffer; + typename MakePointer::Type m_buffer; }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 2e63899..52b803d 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -20,19 +20,7 @@ namespace Eigen { // map_allocator. 
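The TensorForcedEvalOp changes above thread a MakePointer_ template-template parameter through the traits so that a device back-end can substitute its own pointer wrapper for the evaluation buffer. A minimal standalone sketch of that rebinding idiom; ForcedEvalBuffer and MakeTaggedPointer are illustrative names, not Eigen types.

#include <iostream>

template <typename T> struct MakeRawPointer { typedef T* Type; };

// A hypothetical back-end could map T to a fancier handle type here, the way
// the SYCL code paths map it to a global pointer.
template <typename T> struct MakeTaggedPointer {
  struct Tagged { T* p; int device_id; };
  typedef Tagged Type;
};

template <typename Scalar, template <class> class MakePointer_ = MakeRawPointer>
struct ForcedEvalBuffer {
  typename MakePointer_<Scalar>::Type data;  // pointer type chosen by the back-end
};

int main() {
  ForcedEvalBuffer<float> host;                       // data is float*
  ForcedEvalBuffer<float, MakeTaggedPointer> device;  // data is a Tagged handle
  host.data = nullptr;
  device.data.p = nullptr;
  device.data.device_id = 0;
  std::cout << sizeof(host.data) << " vs " << sizeof(device.data) << "\n";
}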
template struct MakePointer { typedef T* Type; - typedef T& RefType; }; -#if defined(EIGEN_USE_SYCL) -namespace TensorSycl { -namespace internal{ -template class ReductionFunctor; -template -class FullReductionKernelFunctor; -} -} -#endif - - template class MakePointer_ = MakePointer> class TensorMap; template class Tensor; @@ -75,7 +63,7 @@ template class TensorCustomUnaryOp; template class TensorCustomBinaryOp; template class MakePointer_ = MakePointer> class TensorEvalToOp; -template class TensorForcedEvalOp; +template class MakePointer_ = MakePointer> class TensorForcedEvalOp; template class TensorDevice; template struct TensorEvaluator; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 3b4f8ed..d73f6dc 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -33,7 +33,7 @@ struct functor_traits > */ template struct scalar_mod2_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op) + EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } }; template @@ -42,7 +42,7 @@ struct functor_traits > template struct scalar_fmod_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op) + EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { return numext::fmod(a, b); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index ef1c9c4..ede3939 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -37,8 +37,6 @@ namespace { { #ifdef __CUDA_ARCH__ return __clz(val); -#elif defined(__SYCL_DEVICE_ONLY__) - return cl::sycl::clz(val); #elif EIGEN_COMP_MSVC unsigned long index; _BitScanReverse(&index, val); @@ -55,8 +53,6 @@ namespace { { #ifdef __CUDA_ARCH__ return __clzll(val); -#elif defined(__SYCL_DEVICE_ONLY__) - return cl::sycl::clz(val); #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 unsigned long index; _BitScanReverse64(&index, val); @@ -92,8 +88,6 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { #if defined(__CUDA_ARCH__) return __umulhi(a, b); -#elif defined(__SYCL_DEVICE_ONLY__) - return cl::sycl::mul_hi(a, static_cast(b)); #else return (static_cast(a) * b) >> 32; #endif @@ -103,8 +97,6 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { #if defined(__CUDA_ARCH__) return __umul64hi(a, b); -#elif defined(__SYCL_DEVICE_ONLY__) - return cl::sycl::mul_hi(a, static_cast(b)); #elif defined(__SIZEOF_INT128__) __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); return static_cast(v >> 64); @@ -124,7 +116,7 @@ namespace { template struct DividerHelper<64, T> { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__) +#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; @@ -205,8 +197,6 @@ class TensorIntDivisor { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { 
#ifdef __CUDA_ARCH__ return (__umulhi(magic, n) >> shift); -#elif defined(__SYCL_DEVICE_ONLY__) - return (cl::sycl::mul_hi(static_cast(magic), static_cast(n)) >> shift); #else uint64_t v = static_cast(magic) * static_cast(n); return (static_cast(v >> 32) >> shift); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index f92e39d..ee0078b 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -51,12 +51,4 @@ #endif -#if EIGEN_OS_WIN || EIGEN_OS_WIN64 -#define EIGEN_SLEEP(n) Sleep(n) -#elif EIGEN_OS_GNULINUX -#define EIGEN_SLEEP(n) usleep(n * 1000); -#else -#define EIGEN_SLEEP(n) sleep(std::max(1, n/1000)) -#endif - #endif diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index b5ef31d..615559d 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -75,7 +75,6 @@ struct PacketType { HasSqrt = 1, HasRsqrt = 1, HasExp = 1, - HasExpm1 = 0, HasLog = 1, HasLog1p = 0, HasLog10 = 0, @@ -169,12 +168,12 @@ template struct IndexPair { #ifdef EIGEN_HAS_SFINAE namespace internal { - template + template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType& idx, numeric_list) { return { idx[Is]... }; } - template + template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType&, numeric_list) { return array(); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 6ddd2ca..d34f1e3 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -299,16 +299,6 @@ template struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } }; #endif - -// It is very expensive to start the memcpy kernel on GPU: we therefore only -// use it for large copies. 
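The comment above captures the policy behind MemcpyTriggerForSlicing: launching a device copy has a fixed cost, so a single bulk memcpy is only worth it past a size threshold, and smaller contiguous slices are copied element by element. A host-only sketch of that dispatch; the 4*1024*1024 cutoff is the constant from the trigger above, the rest is illustrative.

#include <cstddef>
#include <cstring>

template <typename Scalar>
void copy_contiguous_slice(Scalar* dst, const Scalar* src, std::size_t count) {
  const std::size_t kThreshold = 4 * 1024 * 1024;  // same constant as the trigger above
  if (count > kThreshold) {
    std::memcpy(dst, src, count * sizeof(Scalar));  // bulk path (a device memcpy in Eigen)
  } else {
    for (std::size_t i = 0; i < count; ++i) {
      dst[i] = src[i];  // small slices: plain per-element copy
    }
  }
}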
-#ifdef EIGEN_USE_SYCL -template struct MemcpyTriggerForSlicing { - EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } -}; -#endif - } // Eval as rvalue @@ -503,14 +493,7 @@ struct TensorEvaluator, Devi } return NULL; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const{ - return m_impl; - } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& startIndices() const{ - return m_offsets; - } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { @@ -711,12 +694,6 @@ struct TensorEvaluator XprType; static const int NumDims = internal::array_size::value; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNonConst; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Strides Dimensions; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -729,7 +706,7 @@ struct TensorEvaluator startIndicesClamped, stopIndicesClamped; @@ -739,7 +716,7 @@ struct TensorEvaluator::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Strides Dimensions; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } @@ -827,15 +811,6 @@ struct TensorEvaluator& impl() const{return m_impl;} - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { @@ -857,11 +832,7 @@ struct TensorEvaluator m_outputStrides; @@ -874,10 +845,6 @@ struct TensorEvaluator m_offsets; // offset in a flattened shape const Strides m_strides; std::size_t m_block_total_size_max; - //use by sycl - const StartIndices m_exprStartIndices; - //use by sycl - const StopIndices m_exprStopIndices; }; // Eval as lvalue diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index a8e2552..647bcf1 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -200,13 +200,6 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& padding_value() const { return m_paddingValue; } - /// used by sycl - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const{return m_impl;} - private: EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( Index index, int dim_index) const { diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index e341e2e..41d0d00 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -11,20 +11,8 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H -// clang is incompatible with the CUDA syntax wrt making a kernel a class friend, -// so we'll use a macro to make clang happy. 
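The strided-slicing evaluator changes earlier in this section clamp the user-supplied start and stop indices into the valid range before computing each output extent. The helper below is a simplified, standalone illustration of that per-dimension arithmetic; the names and the exact clamping bounds are ours, not Eigen's.

#include <algorithm>
#include <cstdint>

inline std::int64_t clamp_index(std::int64_t value, std::int64_t lo, std::int64_t hi) {
  return std::max(lo, std::min(value, hi));  // same idea as the clamp() helper above
}

// Number of elements a slice [start, stop) with the given stride yields
// along a dimension of size dim.
inline std::int64_t strided_slice_size(std::int64_t start, std::int64_t stop,
                                       std::int64_t stride, std::int64_t dim) {
  if (stride > 0) {
    start = clamp_index(start, 0, dim);
    stop  = clamp_index(stop, 0, dim);
    return start < stop ? (stop - start + stride - 1) / stride : 0;
  }
  // Negative stride walks from start down towards stop (exclusive).
  start = clamp_index(start, 0, dim - 1);
  stop  = clamp_index(stop, -1, dim - 1);
  return start > stop ? (start - stop + (-stride) - 1) / (-stride) : 0;
}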
-#ifndef KERNEL_FRIEND -#if defined(__clang__) && defined(__CUDA__) -#define KERNEL_FRIEND friend __global__ -#else -#define KERNEL_FRIEND friend -#endif -#endif - - namespace Eigen { - /** \class TensorReduction * \ingroup CXX11_Tensor_Module * @@ -692,23 +680,17 @@ struct TensorEvaluator, template friend struct internal::FullReducerShard; #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 - template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); - template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); - template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); -#endif - template KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); - - template KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); + template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); + template friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); #endif + template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); -#if defined(EIGEN_USE_SYCL) - template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor; - template friend class TensorSycl::internal::FullReductionKernelFunctor; + template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif - template friend struct internal::InnerReducer; // Returns the Index in the input tensor of the first value that needs to be diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index edb0ab2..65638b6 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -287,6 +287,7 @@ struct FullReductionLauncher< void>::type> { static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType Scalar; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup(num_coeffs, block_size * num_per_thread); diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index c3ca129..3daecb0 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -25,28 +25,61 @@ namespace Eigen { namespace internal { -template struct syclGenericBufferReducer{ +template struct syclGenericBufferReducer{ template -static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ +static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ do { - auto f = [length, local, op, 
&bufOut, &bufI](cl::sycl::handler& h) mutable { + auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable { cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, cl::sycl::range<1>{std::min(length, local)}}; /* Two accessors are used: one to the buffer that is being reduced, * and a second to local memory, used to store intermediate data. */ - auto aI =bufI.template get_access(h); - auto aOut =bufOut.template get_access(h); - typedef decltype(aI) InputAccessor; - typedef decltype(aOut) OutputAccessor; - typedef cl::sycl::accessor LocalAccessor; - LocalAccessor scratch(cl::sycl::range<1>(local), h); + auto aI = + bufI.template get_access(h); + auto aOut = + bufOut->template get_access(h); + cl::sycl::accessor + scratch(cl::sycl::range<1>(local), h); /* The parallel_for invocation chosen is the variant with an nd_item * parameter, since the code requires barriers for correctness. */ - h.parallel_for(r, TensorSycl::internal::GenericKernelReducer(op, aOut, aI, scratch, length, local)); + h.parallel_for( + r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) { + size_t globalid = id.get_global(0); + size_t localid = id.get_local(0); + /* All threads collectively read from global memory into local. + * The barrier ensures all threads' IO is resolved before + * execution continues (strictly speaking, all threads within + * a single work-group - there is no co-ordination between + * work-groups, only work-items). */ + if (globalid < length) { + scratch[localid] = aI[globalid]; + } + id.barrier(cl::sycl::access::fence_space::local_space); + + /* Apply the reduction operation between the current local + * id and the one on the other half of the vector. */ + if (globalid < length) { + int min = (length < local) ? length : local; + for (size_t offset = min / 2; offset > 0; offset /= 2) { + if (localid < offset) { + scratch[localid] += scratch[localid + offset]; + } + id.barrier(cl::sycl::access::fence_space::local_space); + } + /* The final result will be stored in local id 0. */ + if (localid == 0) { + aI[id.get_group(0)] = scratch[localid]; + if((length<=local) && globalid ==0){ + aOut[globalid]=scratch[localid]; + } + } + } + }); }; - dev.sycl_queue().submit(f); - dev.asynchronousExec(); + dev.m_queue.submit(f); + dev.m_queue.throw_asynchronous(); /* At this point, you could queue::wait_and_throw() to ensure that * errors are caught quickly. However, this would likely impact @@ -54,23 +87,18 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev length = length / local; } while (length > 1); -} -}; -template struct syclGenericBufferReducer, CoeffReturnType>{ -template -static void run(Eigen::internal::MeanReducer, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ - syclGenericBufferReducer, CoeffReturnType>::run(Eigen::internal::SumReducer(), - bufOut, bufI, dev, length, local); + } + }; +/// For now let's start with a full reducer /// Self is useless here because in expression construction we are going to treat reduction as a leafnode. /// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the /// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as // a leafNode. 
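The parallel_for body above is the classic work-group tree reduction: each round folds element [i + offset] into element [i] in local memory, halves the offset, and synchronises, leaving the partial sum in slot 0. Below is a serial host model of one work-group, useful as a reference when checking such a kernel; it assumes a power-of-two scratch size, which the launch code above arranges by rounding its ranges to powers of two.

#include <cassert>
#include <cstddef>
#include <numeric>
#include <vector>

// Serial model of one work-group's halving-tree reduction.
float workgroup_tree_reduce(std::vector<float> scratch) {
  assert((scratch.size() & (scratch.size() - 1)) == 0);  // power-of-two assumption
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
    for (std::size_t i = 0; i < offset; ++i) {
      scratch[i] += scratch[i + offset];  // one "work-item" of this round
    }
    // A local barrier separates the rounds on the device.
  }
  return scratch.empty() ? 0.f : scratch[0];
}

int main() {
  std::vector<float> v = {1, 2, 3, 4, 5, 6, 7, 8};
  assert(workgroup_tree_reduce(v) == std::accumulate(v.begin(), v.end(), 0.f));
}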
- template struct FullReducer { @@ -79,8 +107,8 @@ struct FullReducer { static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef Eigen::TensorSycl::internal::FunctorExtractor > FunctorExpr; - FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); + typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; + auto functors = TensorSycl::internal::extractFunctors(self.impl()); int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread. size_t inputSize =self.impl().dimensions().TotalSize(); size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input @@ -88,7 +116,7 @@ struct FullReducer { if(rng ==0) { red_factor=1; }; - size_t tileSize =dev.sycl_queue().get_device(). template get_info()/2; + size_t tileSize =dev.m_queue.get_device(). template get_info()/2; size_t GRange=std::max((size_t )1, rng); // convert global range to power of 2 for redecution @@ -105,66 +133,105 @@ struct FullReducer { size_t outTileSize = tileSize; /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one. if (GRange < outTileSize) outTileSize=GRange; + // getting final out buffer at the moment the created buffer is true because there is no need for assign + auto out_buffer =dev.template get_sycl_buffer::type>(self.dimensions().TotalSize(), output); /// creating the shared memory for calculating reduction. /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can /// recursively apply reduction on it in order to reduce the whole. auto temp_global_buffer =cl::sycl::buffer(cl::sycl::range<1>(GRange)); typedef typename Eigen::internal::remove_all::type Dims; - // Dims dims= self.xprDims(); - //Op functor = reducer; - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is a workaround for gcc 4.8 bug - typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType; + Dims dims= self.xprDims(); + Op functor = reducer; + dev.m_queue.submit([&](cl::sycl::handler &cgh) { // create a tuple of accessors from Evaluator - TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); + auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); auto tmp_global_accessor = temp_global_buffer. template get_access(cgh); - typedef decltype(tmp_global_accessor) OutAccessor; - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), - TensorSycl::internal::FullReductionKernelFunctor - (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors)); + + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; + auto device_expr = TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); + /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour + /// the first behaviour is when it is used as a root to lauch the sub-kernel. 
The second one is when it is treated as a leafnode to pass the + /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. + const auto device_self_expr= TensorReductionOp(device_expr.expr, dims, functor); + /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is + /// the device_evaluator is detectable and recognisable on the device. + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + /// const cast added as a naive solution to solve the qualifier drop error + auto globalid=itemID.get_global_linear_id(); + + if(globalid::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast(functor)); + else + tmp_global_accessor.get_pointer()[globalid]=static_cast(0); + + if(remaining!=0 && globalid==0 ) + // this will add the rest of input buffer when the input size is not devidable to red_factor. + tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast(functor)); + }); }); - dev.asynchronousExec(); + dev.m_queue.throw_asynchronous(); - // getting final out buffer at the moment the created buffer is true because there is no need for assign - auto out_buffer =dev.get_sycl_buffer(output); - /// This is used to recursively reduce the tmp value to an element of 1; - syclGenericBufferReducer::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize); +/// This is used to recursively reduce the tmp value to an element of 1; + syclGenericBufferReducer::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize); } }; - template struct InnerReducer { typedef typename Self::CoeffReturnType CoeffReturnType; static const bool HasOptimizedImplementation = false; - static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) { + static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) { typedef const typename Self::ChildType HostExpr; /// this is the child of reduction - typedef Eigen::TensorSycl::internal::FunctorExtractor > FunctorExpr; - FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl()); - typename Self::Index range, GRange, tileSize; - typedef typename Eigen::internal::remove_all::type Dims; + typedef typename TensorSycl::internal::createPlaceHolderExpression::Type PlaceHolderExpr; + auto functors = TensorSycl::internal::extractFunctors(self.impl()); + + size_t tileSize =dev.m_queue.get_device(). template get_info()/2; + size_t GRange=num_coeffs_to_preserve; + if (tileSize>GRange) tileSize=GRange; + else if(GRange>tileSize){ + size_t xMode = GRange % tileSize; + if (xMode != 0) GRange += (tileSize - xMode); + } // getting final out buffer at the moment the created buffer is true because there is no need for assign /// creating the shared memory for calculating reduction. /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can /// recursively apply reduction on it in order to reduce the whole. 
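The launch parameters computed above implement a two-stage full reduction: each work-item folds red_factor consecutive coefficients into one partial result, the tail that does not fill a whole chunk is added into partial 0, and the partials are then reduced again. A serial host model of that sizing logic, restricted to a sum reduction over a contiguous input and using illustrative names:

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

float two_stage_sum(const std::vector<float>& in, std::size_t red_factor = 256) {
  std::size_t rng = in.size() / red_factor;        // number of first-stage partials
  std::size_t remaining = in.size() % red_factor;  // tail folded into partial 0
  if (rng == 0) { red_factor = 1; rng = in.size(); remaining = 0; }  // tiny inputs
  std::vector<float> partials(rng, 0.f);
  for (std::size_t g = 0; g < rng; ++g) {          // first stage: one chunk per "thread"
    partials[g] = std::accumulate(in.begin() + g * red_factor,
                                  in.begin() + (g + 1) * red_factor, 0.f);
  }
  if (remaining != 0) {                            // the kernel above does this from id 0
    partials[0] += std::accumulate(in.end() - remaining, in.end(), 0.f);
  }
  // Second stage: the device reduces the partials with the tree kernel shown
  // earlier; a plain accumulate is enough for this host model.
  return std::accumulate(partials.begin(), partials.end(), 0.f);
}

int main() {
  std::cout << two_stage_sum(std::vector<float>(1000, 1.f)) << "\n";  // prints 1000
}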
- dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); - dev.sycl_queue().submit([&](cl::sycl::handler &cgh) { - // this is workaround for gcc 4.8 bug. - typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc; - // create a tuple of accessors from Evaluator - Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); - auto output_accessor = dev.template get_sycl_accessor(cgh, output); - Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast(1); - cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), - TensorSycl::internal::ReductionFunctor - (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size)); + typedef typename Eigen::internal::remove_all::type Dims; + Dims dims= self.xprDims(); + Op functor = reducer; + dev.m_queue.submit([&](cl::sycl::handler &cgh) { + // create a tuple of accessors from Evaluator + auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); + auto output_accessor = dev.template get_sycl_accessor(num_coeffs_to_preserve,cgh, output); + + cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { + typedef typename TensorSycl::internal::ConvertToDeviceExpression::Type DevExpr; + auto device_expr = TensorSycl::internal::createDeviceExpression(functors, tuple_of_accessors); + /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour + /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the + /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here. + const auto device_self_expr= TensorReductionOp(device_expr.expr, dims, functor); + /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is + /// the device_evaluator is detectable and recognisable on the device. 
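The GRange adjustment above rounds the number of preserved output coefficients up to a multiple of the tile (work-group) size, and the kernel then guards on its global id so the padding work-items do nothing. A host-side sketch of that pattern, assuming for simplicity that each output reduces a contiguous run of inputs; Eigen's real mapping goes through firstInput and the stride bookkeeping.

#include <cstddef>
#include <vector>

std::size_t round_up_range(std::size_t n, std::size_t tile) {
  const std::size_t rem = n % tile;
  return rem == 0 ? n : n + (tile - rem);  // same adjustment as GRange above
}

void inner_reduce(const std::vector<float>& in, std::vector<float>& out,
                  std::size_t reduce_size, std::size_t tile) {
  const std::size_t global_range = round_up_range(out.size(), tile);
  for (std::size_t globalid = 0; globalid < global_range; ++globalid) {
    if (globalid >= out.size()) continue;  // padding work-items exit early
    float accum = 0.f;                     // reducer.initialize()
    for (std::size_t i = 0; i < reduce_size; ++i) {
      accum += in[globalid * reduce_size + i];
    }
    out[globalid] = accum;                 // reducer.finalize(accum)
  }
}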
+ typedef Eigen::TensorEvaluator DeiceSelf; + auto device_self_evaluator = Eigen::TensorEvaluator(device_self_expr, Eigen::DefaultDevice()); + /// const cast added as a naive solution to solve the qualifier drop error + auto globalid=itemID.get_global_linear_id(); + if (globalid< static_cast(num_coeffs_to_preserve)) { + typename DeiceSelf::CoeffReturnType accum = functor.initialize(); + GenericDimReducer::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast(functor), &accum); + functor.finalize(accum); + output_accessor.get_pointer()[globalid]= accum; + } + }); }); - dev.asynchronousExec(); + dev.m_queue.throw_asynchronous(); return false; } }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index e430b08..14e392e 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -224,11 +224,6 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator & impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from sycl device - ReverseDimensions functor() const { return m_reverse; } - protected: Dimensions m_dimensions; array m_strides; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index edc9dd3..113c060 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -117,7 +117,7 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation()) + : m_impl(op.expression(), device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Shuffle& shuffle = op.shufflePermutation(); @@ -187,11 +187,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - // required by sycl - EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;} - // required by sycl - EIGEN_STRONG_INLINE const TensorEvaluator& impl() const {return m_impl;} - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; @@ -211,12 +206,11 @@ struct TensorEvaluator, Device> return inputIndex + index * m_inputStrides[NumDims - 1]; } } + Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; - /// required by sycl - Shuffle m_shuffle; }; diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index e6a666f..2854a4a 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -31,12 +31,12 @@ namespace Eigen { * * \sa Tensor */ -template class TensorStorage; +template class TensorStorage; // Pure fixed-size storage -template -class TensorStorage +template +class TensorStorage { private: static const std::size_t Size = FixedDimensions::total_size; @@ -66,7 +66,7 @@ class TensorStorage // pure dynamic -template +template class TensorStorage, Options_> { public: diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 
2237140..6c35bfd 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -117,11 +117,11 @@ struct TensorEvaluator, Device> }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_strides(op.strides()) + : m_impl(op.expression(), device) { m_dimensions = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] =Eigen::numext::ceil(static_cast(m_dimensions[i]) / op.strides()[i]); + m_dimensions[i] = ceilf(static_cast(m_dimensions[i]) / op.strides()[i]); } const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); @@ -224,11 +224,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// required by sycl in order to extract the accessor - Strides functor() const { return m_strides; } - protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { @@ -255,9 +250,9 @@ struct TensorEvaluator, Device> array m_outputStrides; array m_inputStrides; TensorEvaluator m_impl; - const Strides m_strides; }; + // Eval as lvalue template struct TensorEvaluator, Device> @@ -291,11 +286,6 @@ struct TensorEvaluator, Device> return this->m_impl.coeffRef(this->srcCoeff(index)); } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return this->m_impl; } - /// required by sycl in order to extract the accessor - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h index 9d5a6d4..bb8800d 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h @@ -20,14 +20,12 @@ template struct MakeGlobalPointer { typedef typename cl::sycl::global_ptr::pointer_t Type; - typedef typename cl::sycl::global_ptr::reference_t RefType; }; // global pointer to set different attribute state for a class template struct MakeLocalPointer { typedef typename cl::sycl::local_ptr::pointer_t Type; - typedef typename cl::sycl::local_ptr::reference_t RefType; }; @@ -35,9 +33,6 @@ namespace Eigen { namespace TensorSycl { namespace internal { - template struct GenericKernelReducer; - - /// This struct is used for special expression nodes with no operations (for example assign and selectOP). 
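The striding evaluator above sizes every output dimension as the ceiling of input_dim / stride. An integer-only version of that computation, avoiding the float ceil used in the hunk, plus a quick check:

#include <cassert>
#include <cstddef>

inline std::size_t strided_extent(std::size_t dim, std::size_t stride) {
  assert(stride > 0);
  return (dim + stride - 1) / stride;  // ceil(dim / stride) without floating point
}

int main() {
  // A dimension of 10 sampled every 3 elements keeps indices 0, 3, 6, 9.
  assert(strided_extent(10, 3) == 4);
  assert(strided_extent(9, 3) == 3);
}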
struct NoOP; @@ -80,15 +75,8 @@ template struct GetType{ /// this is used for extracting tensor reduction #include "TensorReductionSycl.h" -/// this is used for extracting tensor convolution -#include "TensorConvolutionSycl.h" - // kernel execution using fusion #include "TensorSyclRun.h" -//sycl functors -#include "TensorSyclFunctors.h" - -#include "TensorContractionSycl.h" #endif // end of EIGEN_USE_SYCL #endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h index ee8f3c9..8729c86 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h @@ -48,9 +48,9 @@ struct DeviceConvertor{ /// specialisation of the \ref ConvertToDeviceExpression struct when the node /// type is TensorMap #define TENSORMAPCONVERT(CVQual)\ -template class MakePointer_>\ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorMap Type;\ +template class MakePointer_>\ +struct ConvertToDeviceExpression, Options2_, MakePointer_> > {\ + typedef CVQual TensorMap, Options2_, MakeGlobalPointer> Type;\ }; TENSORMAPCONVERT(const) @@ -97,18 +97,8 @@ template \ struct ConvertToDeviceExpression > \ : DeviceConvertor{}; -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp -#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\ -template \ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression::Type> Type;\ -}; -KERNELBROKERCONVERTFORCEDEVAL(const) -KERNELBROKERCONVERTFORCEDEVAL() -#undef KERNELBROKERCONVERTFORCEDEVAL - - - +KERNELBROKERCONVERT(const, true, TensorForcedEvalOp) +KERNELBROKERCONVERT(, false, TensorForcedEvalOp) KERNELBROKERCONVERT(const, true, TensorEvalToOp) KERNELBROKERCONVERT(, false, TensorEvalToOp) #undef KERNELBROKERCONVERT @@ -124,40 +114,6 @@ KERNELBROKERCONVERTREDUCTION(const) KERNELBROKERCONVERTREDUCTION() #undef KERNELBROKERCONVERTREDUCTION -#define KERNELBROKERCONVERTSLICEOP(CVQual)\ -template\ -struct ConvertToDeviceExpression >{\ - typedef CVQual TensorSlicingOp::Type> Type;\ -}; - -KERNELBROKERCONVERTSLICEOP(const) -KERNELBROKERCONVERTSLICEOP() -#undef KERNELBROKERCONVERTSLICEOP - - -#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\ -template\ -struct ConvertToDeviceExpression >{\ - typedef CVQual TensorStridingSlicingOp::Type> Type;\ -}; - -KERNELBROKERCONVERTERSLICESTRIDEOP(const) -KERNELBROKERCONVERTERSLICESTRIDEOP() -#undef KERNELBROKERCONVERTERSLICESTRIDEOP - - -/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp -#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\ -template \ -struct ConvertToDeviceExpression > {\ - typedef CVQual TensorChippingOp::Type> Type;\ -}; -KERNELBROKERCONVERTCHIPPINGOP(const) -KERNELBROKERCONVERTCHIPPINGOP() -#undef KERNELBROKERCONVERTCHIPPINGOP - - - } // namespace internal } // namespace TensorSycl } // namespace Eigen diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h index 3b83b1d..7ed3a3a 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h @@ -25,21 +25,12 @@ namespace Eigen { namespace TensorSycl { namespace internal { - 
-template -struct DeviceFixedSizeTensor; - -template -struct DeviceFixedSizeTensor>{ - template - static EIGEN_ALWAYS_INLINE Expr instantiate(Data& dt) {return Expr(ConvertToActualTypeSycl(typename Expr::Scalar, dt), Indices...);} -}; /// this class is used by EvalToOp in order to create an lhs expression which is /// a pointer from an accessor on device-only buffer template struct EvalToLHSConstructor { PtrType expr; - EvalToLHSConstructor(const utility::tuple::Tuple &t) : expr(ConvertToActualTypeSycl(typename Eigen::internal::remove_all::type, utility::tuple::get(t))) {} + EvalToLHSConstructor(const utility::tuple::Tuple &t): expr((&(*(utility::tuple::get(t).get_pointer())))) {} }; /// \struct ExprConstructor is used to reconstruct the expression on the device and @@ -54,39 +45,21 @@ struct ExprConstructor; /// specialisation of the \ref ExprConstructor struct when the node type is /// TensorMap #define TENSORMAP(CVQual)\ -template class MakePointer_, size_t N, typename... Params>\ -struct ExprConstructor< CVQual TensorMap,\ -CVQual PlaceHolder, N>, Params...>{\ - typedef CVQual TensorMap Type;\ +struct ExprConstructor< CVQual TensorMap, Options2_, MakeGlobalPointer>,\ +CVQual PlaceHolder, Options3_, MakePointer_>, N>, Params...>{\ + typedef CVQual TensorMap, Options2_, MakeGlobalPointer> Type;\ Type expr;\ template \ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ - : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())){}\ + : expr(Type((&(*(utility::tuple::get(t).get_pointer()))), fd.dimensions())) {}\ }; - TENSORMAP(const) TENSORMAP() #undef TENSORMAP -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorMap -#define TENSORMAPFIXEDSIZE(CVQual)\ -template class MakePointer_, size_t N, typename... Params>\ -struct ExprConstructor< CVQual TensorMap, Options_, MakeGlobalPointer>,\ -CVQual PlaceHolder, Options_, MakePointer_>, N>, Params...>{\ - typedef CVQual TensorMap, Options_, MakeGlobalPointer> Type;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &, const utility::tuple::Tuple &t)\ - : expr(DeviceFixedSizeTensor::instantiate(utility::tuple::get(t))){}\ -}; -TENSORMAPFIXEDSIZE(const) -TENSORMAPFIXEDSIZE() -#undef TENSORMAPFIXEDSIZE - #define UNARYCATEGORY(CVQual)\ template class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... 
Params>\ struct ExprConstructor, CVQual UnaryCategory, Params...> {\ @@ -188,30 +161,8 @@ struct ExprConstructor, CVQual ASSIGN(const) ASSIGN() #undef ASSIGN - - - - - /// specialisation of the \ref ExprConstructor struct when the node type is - /// const TensorAssignOp - #define CONVERSIONEXPRCONST(CVQual)\ - template \ - struct ExprConstructor, CVQual TensorConversionOp, Params...> {\ - typedef ExprConstructor my_nested_type;\ - typedef CVQual TensorConversionOp Type;\ - my_nested_type nestedExpr;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\ - }; - - CONVERSIONEXPRCONST(const) - CONVERSIONEXPRCONST() - #undef CONVERSIONEXPRCONST - /// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorEvalToOp /// 0 here is the output number in the buffer +/// TensorEvalToOp #define EVALTO(CVQual)\ template \ struct ExprConstructor, CVQual TensorEvalToOp, Params...> {\ @@ -234,14 +185,14 @@ EVALTO() /// TensorForcedEvalOp #define FORCEDEVAL(CVQual)\ template \ -struct ExprConstructor,\ +struct ExprConstructor,\ CVQual PlaceHolder, N>, Params...> {\ - typedef CVQual TensorMap::Scalar,\ - TensorForcedEvalOp::NumDimensions, Eigen::internal::traits>::Layout, typename TensorForcedEvalOp::Index>, Eigen::internal::traits>::Layout, MakeGlobalPointer> Type;\ + typedef CVQual TensorMap::Scalar,\ + TensorForcedEvalOp::NumDimensions, 0, typename TensorForcedEvalOp::Index>, 0, MakeGlobalPointer> Type;\ Type expr;\ template \ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ - : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())) {}\ + : expr(Type((&(*(utility::tuple::get(t).get_pointer()))), fd.dimensions())) {}\ }; FORCEDEVAL(const) @@ -262,130 +213,17 @@ struct ExprConstructor, N>, Params...> {\ static const size_t NumIndices= ValueCondition< TensorReductionOp::NumDimensions==0, 1, TensorReductionOp::NumDimensions >::Res;\ typedef CVQual TensorMap::Scalar,\ - NumIndices, Eigen::internal::traits>::Layout, typename TensorReductionOp::Index>, Eigen::internal::traits>::Layout, MakeGlobalPointer> Type;\ + NumIndices, 0, typename TensorReductionOp::Index>, 0, MakeGlobalPointer> Type;\ Type expr;\ template \ ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ - :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())) {}\ + : expr(Type((&(*(utility::tuple::get(t).get_pointer()))), fd.dimensions())) {}\ }; SYCLREDUCTIONEXPR(const) SYCLREDUCTIONEXPR() #undef SYCLREDUCTIONEXPR - -/// specialisation of the \ref ExprConstructor struct when the node type is -/// TensorContractionOp -#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\ -template \ -struct ExprConstructor,\ -CVQual PlaceHolder, N>, Params...> {\ - static const size_t NumIndices= Eigen::internal::traits >::NumDimensions;\ - typedef CVQual TensorMap::Scalar,\ - NumIndices, Eigen::internal::traits >::Layout,\ - typename ExprNode::Index>,\ - Eigen::internal::traits>::Layout, MakeGlobalPointer> Type;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple &t)\ - :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get(t)), fd.dimensions())) {}\ -}; - -SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp) -SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp) -SYCLCONTRACTIONCONVOLUTION(, 
TensorConvolutionOp) -#undef SYCLCONTRACTIONCONVOLUTION - - - -#define SYCLSLICEOPEXPR(CVQual)\ -template\ -struct ExprConstructor , CVQual TensorSlicingOp, Params... >{\ - typedef ExprConstructor my_xpr_type;\ - typedef CVQual TensorSlicingOp Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.dimensions()) {}\ -}; - -SYCLSLICEOPEXPR(const) -SYCLSLICEOPEXPR() -#undef SYCLSLICEOPEXPR - - -#define SYCLSLICESTRIDEOPEXPR(CVQual)\ -template\ -struct ExprConstructor, CVQual TensorStridingSlicingOp, Params... >{\ - typedef ExprConstructor my_xpr_type;\ - typedef CVQual TensorStridingSlicingOp Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.stopIndices(),funcD.strides()) {}\ -}; - -SYCLSLICESTRIDEOPEXPR(const) -SYCLSLICESTRIDEOPEXPR() -#undef SYCLSLICESTRIDEOPEXPR - -#define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\ -template\ -struct ExprConstructor , CVQual OPEXPR , Params... >{\ - typedef ExprConstructor my_xpr_type;\ - typedef CVQual OPEXPR Type ;\ - my_xpr_type xprExpr;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\ -}; - -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const) -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, ) - -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const) -SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, ) -#undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST - -#define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\ -template\ -struct ExprConstructor , CVQual OPEXPR , Params... >{\ - typedef ExprConstructor my_xpr_type;\ - typedef CVQual OPEXPR Type ;\ - my_xpr_type xprExpr;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\ -}; - -SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const) -SYCLPADDINGOPEXPRCONST(TensorPaddingOp, ) -#undef SYCLPADDINGOPEXPRCONST - - -// TensorChippingOp -#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\ -template\ -struct ExprConstructor , CVQual TensorChippingOp, Params... >{\ - typedef ExprConstructor my_xpr_type;\ - typedef CVQual TensorChippingOp Type;\ - my_xpr_type xprExpr;\ - Type expr;\ - template \ - ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple &t)\ - : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.offset(), funcD.dimId()) {}\ -}; - -SYCLTENSORCHIPPINGOPEXPR(const) -SYCLTENSORCHIPPINGOPEXPR() -#undef SYCLTENSORCHIPPINGOPEXPR - - /// template deduction for \ref ExprConstructor struct template auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple &t) diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h index b512d43..b1da685 100644 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h +++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h @@ -35,8 +35,6 @@ namespace Eigen { namespace TensorSycl { namespace internal { -#define RETURN_CPP11(expr) ->decltype(expr) {return expr;} - /// \struct ExtractAccessor: Extract Accessor Class is used to extract the /// accessor from a buffer. 
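The ExprConstructor specialisations above rebuild the expression tree on the device from two ingredients: a functor-only mirror of the tree (the FuncDetector) and a tuple of accessors addressed by leaf index. The snippet below is a heavily reduced host analogue of that idea, using plain pointers instead of SYCL accessors and invented node types, purely to show the recursive construct-from-(detector, tuple) shape.

#include <cstddef>
#include <iostream>
#include <tuple>

struct LeafDetector  { std::size_t size; };                // dimensions only, no data
struct ScaleDetector { float factor; LeafDetector sub; };  // functor + sub-tree

struct LeafExpr {
  const float* data;
  std::size_t size;
  template <typename Tuple>
  LeafExpr(const LeafDetector& d, const Tuple& t)          // N-th "accessor" -> pointer
      : data(std::get<0>(t)), size(d.size) {}
};

struct ScaleExpr {
  LeafExpr sub;
  float factor;
  template <typename Tuple>
  ScaleExpr(const ScaleDetector& d, const Tuple& t) : sub(d.sub, t), factor(d.factor) {}
  float coeff(std::size_t i) const { return factor * sub.data[i]; }
};

int main() {
  float buf[3] = {1.f, 2.f, 3.f};
  ScaleDetector fd{2.f, LeafDetector{3}};
  auto accessors = std::make_tuple(static_cast<const float*>(buf));
  ScaleExpr expr(fd, accessors);       // the "device-side" reconstruction step
  std::cout << expr.coeff(2) << "\n";  // prints 6
}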
/// Depending on the type of the leaf node we can get a read accessor or a @@ -45,192 +43,159 @@ template struct ExtractAccessor; struct AccessorConstructor{ - template static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval) - RETURN_CPP11(ExtractAccessor::getTuple(cgh, eval)) - - template static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2) - RETURN_CPP11(utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1), ExtractAccessor::getTuple(cgh, eval2))) - - template static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3) - RETURN_CPP11(utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor::getTuple(cgh, eval2), ExtractAccessor::getTuple(cgh, eval3)))) - - template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval) - RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor(cgh,eval.data()))) + template static inline auto getTuple(cl::sycl::handler& cgh, Arg eval) + -> decltype(ExtractAccessor::getTuple(cgh, eval)) { + return ExtractAccessor::getTuple(cgh, eval); + } + + template static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1, Arg2 eval2) + -> decltype(utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1), ExtractAccessor::getTuple(cgh, eval2))) { + return utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1), ExtractAccessor::getTuple(cgh, eval2)); + } + template static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1 , Arg2 eval2 , Arg3 eval3) + -> decltype(utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor::getTuple(cgh, eval2), ExtractAccessor::getTuple(cgh, eval3)))) { + return utility::tuple::append(ExtractAccessor::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor::getTuple(cgh, eval2), ExtractAccessor::getTuple(cgh, eval3))); + } + template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval) + -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){ + return utility::tuple::make_tuple(eval.device().template get_sycl_accessor::type>(eval.dimensions().TotalSize(), cgh,eval.data())); + } }; /// specialisation of the \ref ExtractAccessor struct when the node type is -/// TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp -#define SYCLUNARYCATEGORYEXTACC(CVQual)\ -template class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\ -struct ExtractAccessor, Dev> > {\ - static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\ -RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\ +/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp and const TensorBroadcastingOp +template class UnaryCategory, typename OP, typename RHSExpr, typename Dev> +struct ExtractAccessor, Dev> > { + static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev> eval) + -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){ + return AccessorConstructor::getTuple(cgh, eval.impl()); + } }; -SYCLUNARYCATEGORYEXTACC(const) -SYCLUNARYCATEGORYEXTACC() -#undef SYCLUNARYCATEGORYEXTACC - - -/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp -#define SYCLBINARYCATEGORYEXTACC(CVQual)\ -template class BinaryCategory, typename OP, 
typename LHSExpr, typename RHSExpr, typename Dev>\
-struct ExtractAccessor, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp
+template class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct ExtractAccessor, Dev> >
+: ExtractAccessor, Dev> > {};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorCwiseBinaryOp
+template class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev> eval)
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+  }
 };
-
-SYCLBINARYCATEGORYEXTACC(const)
-SYCLBINARYCATEGORYEXTACC()
-#undef SYCLBINARYCATEGORYEXTACC
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
+template class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor, Dev> >
+: ExtractAccessor, Dev> >{};
 /// specialisation of the \ref ExtractAccessor struct when the node type is
 /// const TensorCwiseTernaryOp
-#define SYCLTERNARYCATEGORYEXTACC(CVQual)\
-template class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\
-struct ExtractAccessor, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\
+template class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev> eval)
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){
+    return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());
+  }
 };
-SYCLTERNARYCATEGORYEXTACC(const)
-SYCLTERNARYCATEGORYEXTACC()
-#undef SYCLTERNARYCATEGORYEXTACC
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseTernaryOp
+template class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor, Dev> >
+: ExtractAccessor, Dev> >{};
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorCwiseSelectOp. This is a special case where there is no OP
+template
+struct ExtractAccessor, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev> eval)
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){
+    return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());
+  }
+};
 /// specialisation of the \ref ExtractAccessor struct when the node type is
 /// TensorCwiseSelectOp. This is a special case where there is no OP
-#define SYCLSELECTOPEXTACC(CVQual)\
-template \
-struct ExtractAccessor, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\
+template
+struct ExtractAccessor, Dev> >
+: ExtractAccessor, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorAssignOp
+template
+struct ExtractAccessor, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev> eval)
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+  }
 };
-SYCLSELECTOPEXTACC(const)
-SYCLSELECTOPEXTACC()
-#undef SYCLSELECTOPEXTACC
-
 /// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
-#define SYCLTENSORASSIGNOPEXTACC(CVQual)\
-template \
-struct ExtractAccessor, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
-};
-
- SYCLTENSORASSIGNOPEXTACC(const)
- SYCLTENSORASSIGNOPEXTACC()
- #undef SYCLTENSORASSIGNOPEXTACC
+template
+struct ExtractAccessor, Dev> >
+: ExtractAccessor, Dev> >{};
 /// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
 #define TENSORMAPEXPR(CVQual, ACCType)\
 template \
 struct ExtractAccessor, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\
+  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator, Dev> eval)\
+  -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){\
+    return AccessorConstructor::template getAccessor(cgh, eval);\
+  }\
 };
-
 TENSORMAPEXPR(const, cl::sycl::access::mode::read)
 TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
 #undef TENSORMAPEXPR
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
-#define SYCLFORCEDEVALEXTACC(CVQual)\
-template \
-struct ExtractAccessor, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorForcedEvalOp
+template
+struct ExtractAccessor, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev> eval)
+  -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){
+    return AccessorConstructor::template getAccessor(cgh, eval);
+  }
 };
-SYCLFORCEDEVALEXTACC(const)
-SYCLFORCEDEVALEXTACC()
-#undef SYCLFORCEDEVALEXTACC
-
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
+template
+struct ExtractAccessor, Dev> >
+: ExtractAccessor, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorEvalToOp
+template
+struct ExtractAccessor, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator, Dev> eval)
+  -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){
+    return utility::tuple::append(AccessorConstructor::template getAccessor(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));
+  }
+};
 /// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
-#define SYCLEVALTOEXTACC(CVQual)\
-template \
-struct ExtractAccessor, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\
+template
+struct ExtractAccessor, Dev> >
+: ExtractAccessor, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorReductionOp
+template
+struct ExtractAccessor, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev> eval)
+  -> decltype(AccessorConstructor::template getAccessor(cgh, eval)){
+    return AccessorConstructor::template getAccessor(cgh, eval);
+  }
 };
-SYCLEVALTOEXTACC(const)
-SYCLEVALTOEXTACC()
-#undef SYCLEVALTOEXTACC
-
 /// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXTACC(CVQual)\
-template \
-struct ExtractAccessor, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\
-};
-
-SYCLREDUCTIONEXTACC(const)
-SYCLREDUCTIONEXTACC()
-#undef SYCLREDUCTIONEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
-#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
-template\
- struct ExtractAccessor, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11(AccessorConstructor::template getAccessor(cgh, eval))\
-};
-
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTIONEXTACC
-
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorSlicingOp.
-#define SYCLSLICEOPEXTACC(CVQual)\
-template \
-struct ExtractAccessor, Dev> > {\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICEOPEXTACC(const)
-SYCLSLICEOPEXTACC()
-#undef SYCLSLICEOPEXTACC
-// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorStridingSlicingOp.
-#define SYCLSLICESTRIDEOPEXTACC(CVQual)\
-template\
-struct ExtractAccessor, Dev> >{\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICESTRIDEOPEXTACC(const)
-SYCLSLICESTRIDEOPEXTACC()
-#undef SYCLSLICESTRIDEOPEXTACC
-
-// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorChippingOp.
-#define SYCLTENSORCHIPPINGOPEXTACC(CVQual)\
-template\
-struct ExtractAccessor, Dev> >{\
-  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator, Dev>& eval)\
-  RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLTENSORCHIPPINGOPEXTACC(const)
-SYCLTENSORCHIPPINGOPEXTACC()
-#undef SYCLTENSORCHIPPINGOPEXTACC
-
+template
+struct ExtractAccessor, Dev> >
+: ExtractAccessor, Dev> >{};
 /// template deduction for \ref ExtractAccessor
 template
-auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval)
--> decltype(ExtractAccessor::getTuple(cgh, eval)) {
-  return ExtractAccessor::getTuple(cgh, eval);
+auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& expr)
+-> decltype(ExtractAccessor::getTuple(cgh, expr)) {
+  return ExtractAccessor::getTuple(cgh, expr);
 }
 } /// namespace TensorSycl
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index ee02018..4271253 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -36,277 +36,135 @@ namespace internal {
 template struct FunctorExtractor{
   typedef typename Evaluator::Dimensions Dimensions;
   const Dimensions m_dimensions;
-  EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+  const Dimensions& dimensions() const { return m_dimensions; }
   FunctorExtractor(const Evaluator& expr)
   : m_dimensions(expr.dimensions()) {}
 };
-/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
-///TensorConversionOp
-#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
-template \
-struct FunctorExtractor, Dev> > {\
-  FunctorExtractor > subExpr;\
-  FunctorExtractor(const TensorEvaluator, Dev>& expr)\
-  : subExpr(expr.impl()) {}\
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp, and const TensorBroadcastingOp
+template