Diffstat (limited to 'eigen/unsupported/Eigen/CXX11')
-rw-r--r--  eigen/unsupported/Eigen/CXX11/Tensor | 9
-rw-r--r--  eigen/unsupported/Eigen/CXX11/ThreadPool | 13
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/README.md | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 30
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 287
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 134
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 101
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 76
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 400
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 208
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 3
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 476
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 7
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 4
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 413
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 13
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 12
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 5
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 11
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 4
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 41
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 14
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 4
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 12
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 5
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 53
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 7
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 30
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 1
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 179
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 5
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 10
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 16
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h | 12
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h | 54
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h | 188
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h | 265
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h | 322
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h | 245
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 138
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 57
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h | 77
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 1
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 142
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h | 7
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h | 23
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h | 13
59 files changed, 707 insertions, 3490 deletions
diff --git a/eigen/unsupported/Eigen/CXX11/Tensor b/eigen/unsupported/Eigen/CXX11/Tensor
index 3991609..7ecb4c7 100644
--- a/eigen/unsupported/Eigen/CXX11/Tensor
+++ b/eigen/unsupported/Eigen/CXX11/Tensor
@@ -13,14 +13,13 @@
#include "../../../Eigen/Core"
-#if defined(EIGEN_USE_SYCL)
+#ifdef EIGEN_USE_SYCL
#undef min
#undef max
#undef isnan
#undef isinf
#undef isfinite
#include <SYCL/sycl.hpp>
-#include <iostream>
#include <map>
#include <memory>
#include <utility>
@@ -53,10 +52,8 @@ typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
-#include <windows.h>
#else
#include <stdint.h>
-#include <unistd.h>
#endif
#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
@@ -71,10 +68,6 @@ typedef unsigned __int64 uint64_t;
#include <time.h>
#endif
-#if defined(EIGEN_USE_LIBXSMM)
-#include "libxsmm.h"
-#endif
-
#ifdef EIGEN_USE_THREADS
#include "ThreadPool"
#endif
diff --git a/eigen/unsupported/Eigen/CXX11/ThreadPool b/eigen/unsupported/Eigen/CXX11/ThreadPool
index c346141..09d637e 100644
--- a/eigen/unsupported/Eigen/CXX11/ThreadPool
+++ b/eigen/unsupported/Eigen/CXX11/ThreadPool
@@ -50,7 +50,6 @@
#include "src/ThreadPool/ThreadLocal.h"
#include "src/ThreadPool/ThreadYield.h"
-#include "src/ThreadPool/ThreadCancel.h"
#include "src/ThreadPool/EventCount.h"
#include "src/ThreadPool/RunQueue.h"
#include "src/ThreadPool/ThreadPoolInterface.h"
@@ -58,18 +57,6 @@
#include "src/ThreadPool/SimpleThreadPool.h"
#include "src/ThreadPool/NonBlockingThreadPool.h"
-
-// Use the more efficient NonBlockingThreadPool by default.
-namespace Eigen {
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
-template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
-typedef NonBlockingThreadPool ThreadPool;
-#else
-template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
-typedef SimpleThreadPool ThreadPool;
-#endif
-} // namespace Eigen
-
#endif
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
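
For reference, the block deleted above is what used to define the Eigen::ThreadPoolTempl alias and the Eigen::ThreadPool typedef in this header. A minimal sketch of what calling code might look like once the alias is gone, naming NonBlockingThreadPool directly; the include path and thread counts are illustrative assumptions, not part of this patch:

// Sketch only: with the ThreadPool typedef removed from this header,
// callers name the concrete pool type themselves.
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // NonBlockingThreadPool and SimpleThreadPool are both still included above.
  Eigen::NonBlockingThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
  a.setRandom();
  b.setRandom();
  c.device(device) = a + b;  // expression evaluated on the pool
  return 0;
}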
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md b/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
index 38cdb9c..98e8381 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1737,9 +1737,11 @@ TODO
## Representation of scalar values
-Scalar values are often represented by tensors of size 1 and rank 0.For example
-Tensor<T, N>::maximum() currently returns a Tensor<T, 0>. Similarly, the inner
-product of 2 1d tensors (through contractions) returns a 0d tensor.
+Scalar values are often represented by tensors of size 1 and rank 1. It would be
+more logical and user friendly to use tensors of rank 0 instead. For example
+Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner
+product of 2 1d tensors (through contractions) returns a 1d tensor. In the
+future these operations might be updated to return 0d tensors instead.
## Limitations
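
As context for the wording change above, here is a small self-contained sketch of the rank-0 behaviour described in the removed ("-") lines. Whether a given checkout returns rank 0 or rank 1 for these results is exactly what this hunk is toggling, so treat the exact ranks as illustrative:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> t(3, 4);
  t.setRandom();

  // Reducing over all dimensions yields a rank-0 tensor holding one scalar.
  Eigen::Tensor<float, 0> m = t.maximum();
  std::cout << "max = " << m() << "\n";  // rank-0 tensors are read with ()

  // Inner product of two 1-d tensors via contraction: also a rank-0 result.
  Eigen::Tensor<float, 1> a(5), b(5);
  a.setRandom(); b.setRandom();
  Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(0, 0) };
  Eigen::Tensor<float, 0> dot = a.contract(b, dims);
  std::cout << "dot = " << dot() << "\n";
  return 0;
}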
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index fbe3408..7a45a5c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -186,12 +186,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived>
- expm1() const {
- return unaryExpr(internal::scalar_expm1_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
log() const {
return unaryExpr(internal::scalar_log_op<Scalar>());
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 23a7446..4cfe300 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -54,7 +54,7 @@ struct is_input_scalar<Sizes<> > {
static const bool value = true;
};
#ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::ptrdiff_t... Indices>
+template <typename std::size_t... Indices>
struct is_input_scalar<Sizes<Indices...> > {
static const bool value = (Sizes<Indices...>::total_size == 1);
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index c46a778..1ba7ef1 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -50,7 +50,6 @@ template <DenseIndex DimId>
struct DimensionId
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
- EIGEN_UNUSED_VARIABLE(dim);
eigen_assert(dim == DimId);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
@@ -151,7 +150,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset())
+ : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{
EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(NumInputDims > m_dim.actualDim());
@@ -207,7 +206,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
Index inputIndex = index * m_inputStride + m_inputOffset;
@@ -219,7 +218,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(m_stride > index);
return m_impl.template packet<LoadMode>(index + m_inputOffset);
@@ -275,29 +274,17 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
}
}
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const {
- return m_dim.actualDim();
- }
-
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const {
- return m_offset;
- }
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex;
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
inputIndex = index * m_inputStride + m_inputOffset;
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(m_stride > index);
inputIndex = index + m_inputOffset;
@@ -317,9 +304,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
TensorEvaluator<ArgType, Device> m_impl;
const internal::DimensionId<DimId> m_dim;
const Device& m_device;
-// required by sycl
- const DenseIndex m_offset;
-
};
@@ -360,7 +344,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
- (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
+ (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(this->m_stride == 1);
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
@@ -371,7 +355,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
inputIndex += this->m_inputStride;
}
} else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) ||
- (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
+ (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(this->m_stride > index);
this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 2c7ba96..59bf90d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -276,12 +276,6 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
- /// required by sycl in order to extract the accessor
- const Axis& axis() const { return m_axis; }
protected:
Dimensions m_dimensions;
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index bf4a476..20b29e5 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -20,70 +20,6 @@ namespace Eigen {
*
*/
namespace internal {
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-template<typename Scalar, typename Index>
-void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) {
- size_t psize = packet_traits<Scalar>::size; // Packet size
- typedef typename packet_traits<Scalar>::type Packet; // Packet type
- size_t alignment = psize*sizeof(Scalar); // Needed alignment
- if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 &&
- (ldsrc*sizeof(Scalar)) % alignment == 0 &&
- reinterpret_cast<uintptr_t>(src) % alignment == 0 &&
- reinterpret_cast<uintptr_t>(dst) % alignment == 0) {
- // Optimized version using packets
- size_t num_packets = rows / psize;
- for (Index col = 0; col < cols; ++col) {
- EIGEN_ASM_COMMENT("begin pack_simple inner copy");
- // Unrolled manually 4 times.
- for (size_t i=0; i < num_packets/4; ++i) {
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- }
- for (size_t i=0; i < num_packets%4; ++i) {
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- }
- dst += lddst - num_packets*psize;
- src += ldsrc - num_packets*psize;
- EIGEN_ASM_COMMENT("end pack_simple inner copy");
- }
- } else {
- // Naive memcpy calls
- for (Index col = 0; col < cols; ++col) {
- memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
- }
- }
-}
-
-template<typename LhsScalar, typename RhsScalar, typename Scalar>
- struct libxsmm_wrapper {
- libxsmm_wrapper() {}
- libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {}
- void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {}
- void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {}
- };
-
- template<>
- struct libxsmm_wrapper<float, float, float>: public libxsmm_mmfunction<float> {
- libxsmm_wrapper(): libxsmm_mmfunction() {}
- libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
- libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
- };
-
- template<>
- struct libxsmm_wrapper<double, double, double>: public libxsmm_mmfunction<double> {
- libxsmm_wrapper(): libxsmm_mmfunction() {}
- libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
- libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
- };
-#endif
-
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
@@ -222,7 +158,7 @@ struct TensorContractionEvaluatorBase
m_device(device),
m_result(NULL) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
- static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+ static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -381,8 +317,6 @@ struct TensorContractionEvaluatorBase
}
}
- EnableXSMMIfPossible(eval_op_indices);
-
// If the layout is RowMajor, we need to reverse the m_dimensions
if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -393,7 +327,7 @@ struct TensorContractionEvaluatorBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar * data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
if (data) {
@@ -488,13 +422,6 @@ struct TensorContractionEvaluatorBase
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
- #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- if (m_can_use_xsmm) {
- evalGemmXSMM(buffer);
- return;
- }
- #endif
-
// columns in left side, rows in right side
const Index k = this->m_k_size;
@@ -611,214 +538,7 @@ struct TensorContractionEvaluatorBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
-protected:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) {
- m_can_use_xsmm = false;
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
- if (!std::is_same<Scalar, LhsScalar>::value ||
- !std::is_same<Scalar, RhsScalar>::value ||
- !(std::is_same<Scalar, float>::value ||
- std::is_same<Scalar, double>::value) ||
- m_leftImpl.data() == NULL ||
- m_rightImpl.data() == NULL) {
- return;
- }
-
- // Check if we can use faster matmul algorithms. For contraction to be
- // equivalent to matmul, we need both lhs and rhs contracting dims sequences
- // to be either a prefix or suffix of all dims. Also, the order of both
- // must be the same, so we don't have to do reordering.
- // For example:
- // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)]
- // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)]
- // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)]
- // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)]
- // Depending if contraction dims are prefix or suffix of all dims we need to
- // pre-transpose matrices in matmul algorithm:
- // lhs: prefix -> transpose, suffix -> no transpose
- // rhs: prefix -> no transpose, suffix -> transpose
- // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular,
- // non-transposed matmul.
- if (ContractDims == 0) {
- // This case is totally uninteresting, filter it out to avoid problems
- // with iterations in further tests.
- return;
- }
-
- // Check if RHS dims list is increasing. LHS already is, so if not, the
- // order is different and we cannot do matmul.
- for (int i = 1; i < ContractDims; i++) {
- if (eval_op_indices[i].second < eval_op_indices[i-1].second) {
- return;
- }
- }
-
- // Check if no holes.
- int diff;
- for (int i = 1; i < ContractDims; i++) {
- // LHS contract dims are sorted to form an increasing seq.
- diff = eval_op_indices[i].first - eval_op_indices[i-1].first;
- if (diff != 1) {
- return;
- }
- // Now we may already assume RHS contract dims seq is increasing too.
- diff = eval_op_indices[i].second - eval_op_indices[i-1].second;
- if (diff != 1) {
- return;
- }
- }
-
- // Check if suffix or prefix.
- if (eval_op_indices[0].first != 0 &&
- eval_op_indices[ContractDims-1].first != LDims-1) {
- return;
- }
- if (eval_op_indices[0].second != 0 &&
- eval_op_indices[ContractDims-1].second != RDims-1) {
- return;
- }
-
- m_can_use_xsmm = true;
-#else
- EIGEN_UNUSED_VARIABLE(eval_op_indices);
-#endif
- }
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
-
- // rows in left side
- const Index m = this->m_i_size;
-
- // columns in right side
- const Index n = this->m_j_size;
-
- const bool transposeA = !m_lhs_inner_dim_contiguous;
- const bool transposeB = !m_rhs_inner_dim_contiguous;
-
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
- internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index> blocking(
- k, m, n, 1, transposeA, transposeB);
-
- // Outer blocks sizes
- const Index mc_outer = blocking.outer_m();
- const Index nc_outer = blocking.outer_n();
- const Index kc_outer = blocking.outer_k();
- // Inner blocks sizes
- const Index mc = blocking.mc();
- const Index nc = blocking.nc();
- const Index kc = blocking.kc();
- // Decisions whether we should copy parts of matrices
- const bool copyA = blocking.copyA();
- const bool copyB = blocking.copyB();
-
- const LhsScalar* leftData = m_leftImpl.data();
- const RhsScalar* rightData = m_rightImpl.data();
-
- const libxsmm_blasint stride_A = static_cast<libxsmm_blasint>(transposeA ? k : m);
- const libxsmm_blasint stride_B = static_cast<libxsmm_blasint>(transposeB ? n : k);
- const libxsmm_blasint stride_C = static_cast<libxsmm_blasint>(m);
-
- const libxsmm_blasint stride_blockA = static_cast<libxsmm_blasint>(mc);
- // Use bigger stride to avoid hitting same cache line too often.
- // This consistently gives +~0.5 Gflops.
- const libxsmm_blasint stride_panelB = static_cast<libxsmm_blasint>(
- kc % 32 == 0 ? kc + 16 : kc
- );
-
- // Kernel for the general case (not edges)
- internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar> kernel;
-
- LhsScalar* blockA = NULL;
- RhsScalar* panelB = NULL;
-
- if (copyA) {
- blockA = static_cast<LhsScalar*>(this->m_device.allocate(mc * kc * sizeof(LhsScalar)));
- }
- if (copyB) {
- panelB = static_cast<RhsScalar*>(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar)));
- }
-
- const Index kernel_stride_A = copyA ? stride_blockA : stride_A;
- const Index kernel_stride_B = copyB ? stride_panelB : stride_B;
- kernel = internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch());
-
- // Outer blocking
- for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) {
- for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) {
- for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) {
- using numext::mini;
-
- Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer;
-
- // Inner blocking
- for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) {
- const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki;
- const float beta = ki == 0 ? 0 : 1;
-
- if (copyB) {
- if (transposeB) {
- libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB);
- } else {
- internal::pack_simple<RhsScalar, Index>(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B);
- }
- }
-
- for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) {
- const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi;
-
- const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki :
- leftData + ki*stride_A + mi;
-
- if (copyA) {
- if (transposeA) {
- libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA);
- } else {
- internal::pack_simple<LhsScalar, Index>(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A);
- }
- }
- const LhsScalar* actual_a = copyA ? blockA : a;
-
- for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) {
- const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni;
-
- const RhsScalar* b = rightData + ni*stride_B + ki;
- Scalar* c = buffer + ni*stride_C + mi;
- const Scalar* cp = c + nc*stride_C;
-
- const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b;
- const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B;
-
- if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) {
- // Most used, cached kernel.
- kernel(actual_a, actual_b, c, actual_a, bp, cp);
- } else {
- // Edges - use libxsmm kernel cache.
- internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp);
- }
- }
- }
- }
- }
- }
- }
-
- if (copyA) {
- this->m_device.deallocate(blockA);
- }
- if (copyB) {
- this->m_device.deallocate(panelB);
- }
- }
-#endif
-
+ protected:
// Prevent assignment
TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
Dimensions m_dimensions;
@@ -844,7 +564,6 @@ protected:
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
const Device& m_device;
Scalar* m_result;
- bool m_can_use_xsmm;
};
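
The deleted EnableXSMMIfPossible comments above spell out when a tensor contraction degenerates to a plain matmul: the contracting dimensions must form a same-ordered, hole-free prefix or suffix on both sides. A brief sketch of the two 2-D cases those comments mention, using the public contract() API; shapes and variable names are illustrative:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> A(64, 32);    // 64 x 32
  Eigen::Tensor<float, 2> B(32, 48);    // 32 x 48
  Eigen::Tensor<float, 2> B2(64, 48);   // 64 x 48
  A.setRandom(); B.setRandom(); B2.setRandom();

  // lhs 2-D, rhs 2-D, contraction [(1, 0)]: contract A's columns with B's rows,
  // i.e. a regular, non-transposed matrix product C = A * B (64 x 48).
  Eigen::array<Eigen::IndexPair<int>, 1> matmul = { Eigen::IndexPair<int>(1, 0) };
  Eigen::Tensor<float, 2> C = A.contract(B, matmul);

  // Contracting A's dim 0 instead, [(0, 0)], corresponds to A^T * B2 (32 x 48):
  // still a matmul, but the lhs is logically transposed first, which is the
  // prefix/suffix distinction the removed heuristic was checking.
  Eigen::array<Eigen::IndexPair<int>, 1> t_matmul = { Eigen::IndexPair<int>(0, 0) };
  Eigen::Tensor<float, 2> Ct = A.contract(B2, t_matmul);

  return (C.dimension(0) == 64 && Ct.dimension(0) == 32) ? 0 : 1;
}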
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index d34f9ca..5cf7b4f 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -50,140 +50,6 @@ class TensorContractionBlocking {
};
-
-#if defined(EIGEN_USE_LIBXSMM)
-template <typename LhsScalar, typename RhsScalar, typename Index>
-class TensorXsmmContractionBlocking {
- public:
- TensorXsmmContractionBlocking(Index k, Index m, Index n,
- size_t max_num_threads = 1, bool transposeA = false,
- bool transposeB = false):
- k_(k), m_(m), n_(n), transposeA_(transposeA),
- transposeB_(transposeB), num_threads_(max_num_threads) {
-#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
- if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
- mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
- kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
- nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
- outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
- outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
- outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
- copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
- copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
- outer_m_ = outer_m_ != 0 ? outer_m_ : m;
- outer_k_ = outer_k_ != 0 ? outer_k_ : k;
- outer_n_ = outer_n_ != 0 ? outer_n_ : n;
- }
-#else
- // Defaults, possibly overriden per-platform.
- copyA_ = true;
- copyB_ = false;
-
- // If the matrix is small enough, don't do blocking, just call single xsmm
- // kernel.
- if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) {
- mc_ = m; kc_ = k; nc_ = n;
- outer_m_ = m; outer_k_ = k; outer_n_ = n;
- copyA_ = false; copyB_ = false;
- } else {
- int arch = libxsmm_cpuid_x86();
-
- if (arch == LIBXSMM_X86_AVX512_CORE) {
- // skylake
- mc_ = 64; kc_ = 64; nc_ = 24;
- outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
- // Hack to use this kernel architecture as the other one has performance
- // issues (no hardware prefetching).
- // TODO(nishantpatil): This should be removed if the issues are fixed,
- // or this one becomes the default.
- setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
- } else if (arch == LIBXSMM_X86_AVX2) {
- // haswell
- mc_ = 32; kc_ = 192; nc_ = 33;
- outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
- } else if (arch == LIBXSMM_X86_AVX) {
- // ivybridge
- mc_ = 32; kc_ = 192; nc_ = 48;
- outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
- } else {
- // generic kernel size, usually performing well
- mc_ = 32; kc_ = 128; nc_ = 32;
- outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
- }
-
- // Only copy if it makes the stride smaller.
- copyA_ = copyA_ && (m > mc_);
- copyB_ = copyB_ && (k > kc_);
- }
-
- // We need to copy anyway if transposing
- copyA_ = copyA_ || transposeA;
- copyB_ = copyB_ || transposeB;
-
- // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h
- prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
-
-#endif
-
- mc_ = mc_ > m ? m : mc_;
- nc_ = nc_ > n ? n : nc_;
- kc_ = kc_ > k ? k : kc_;
-
- size_t compute_parallelism = (m / mc_) * (n / nc_);
- size_t pack_parallelism = 0;
- if (copyA_) {
- pack_parallelism += (m / mc_) * (k / kc_);
- }
- if (copyB_) {
- pack_parallelism += (n / nc_) * (k / kc_);
- }
- size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism);
-
- num_threads_ = numext::mini<size_t>(num_threads_,
- parallelism / MIN_JOBS_PER_THREAD);
- num_threads_ = numext::maxi<size_t>(num_threads_, 1);
-
- // For optimal performance outer block sizes should be multiplies of kernel
- // sizes, or bigger than matrix size (=no outer blocking).
- eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
- eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
- eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
- }
-
- EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
- EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
- EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
- EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
- EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
- EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
- EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
- EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
- EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
- EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
- EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
- EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
- EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
- EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
- EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
- return prefetch_;
- }
-
- private:
- Index k_, m_, n_;
- Index kc_, mc_, nc_;
- Index outer_k_, outer_m_, outer_n_;
- bool copyA_, copyB_, transposeA_, transposeB_;
- size_t num_threads_;
-
- // Threshold for m*k*n to skip blocking and just call libxsmm
- const double LIBXSMM_THRESHOLD = 80*80*80;
- // For computing optimal number of threads - so that each thread gets at least
- // that many jobs.
- const double MIN_JOBS_PER_THREAD = 3;
- libxsmm_gemm_prefetch_type prefetch_;
-};
-#endif // EIGEN_USE_LIBXSMM
-
} // end namespace internal
} // end namespace Eigen
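
For reference, the thread-count heuristic in the deleted TensorXsmmContractionBlocking can be summarised as below. This is a simplified restatement of the removed logic, not code that remains in the tree; the helper name is made up and MIN_JOBS_PER_THREAD = 3 as in the deleted class:

#include <algorithm>
#include <cstddef>

// Each thread should get at least min_jobs_per_thread blocks of work,
// where the work is either compute blocks or packing blocks, whichever
// offers more parallelism.
std::size_t pick_num_threads(std::size_t max_threads,
                             std::size_t m, std::size_t n, std::size_t k,
                             std::size_t mc, std::size_t nc, std::size_t kc,
                             bool copyA, bool copyB) {
  const std::size_t compute_parallelism = (m / mc) * (n / nc);
  std::size_t pack_parallelism = 0;
  if (copyA) pack_parallelism += (m / mc) * (k / kc);
  if (copyB) pack_parallelism += (n / nc) * (k / kc);
  const std::size_t parallelism = std::max(compute_parallelism, pack_parallelism);
  const std::size_t min_jobs_per_thread = 3;
  return std::max<std::size_t>(1, std::min(max_threads, parallelism / min_jobs_per_thread));
}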
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index c04b784..d65dbb4 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -529,6 +529,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
float2 rhs_shmem2[][8], const Index m_size,
const Index n_size, const Index k_size,
const Index base_m, const Index base_n) {
+ typedef float Scalar;
// prefetch registers
float4 lhs_pf0, rhs_pf0;
@@ -539,27 +540,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
}
-#define prefetch_lhs(reg, row, col) \
- if (!CHECK_LHS_BOUNDARY) { \
- if (col < k_size) { \
- reg =lhs.template loadPacket<Unaligned>(row, col); \
- } \
- } else { \
- if (col < k_size) { \
- if (row + 3 < m_size) { \
- reg =lhs.template loadPacket<Unaligned>(row, col); \
- } else if (row + 2 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- reg.z =lhs(row + 2, col); \
- } else if (row + 1 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- } else if (row < m_size) { \
- reg.x =lhs(row + 0, col); \
- } \
- } \
- } \
+#define prefetch_lhs(reg, row, col) \
+ if (!CHECK_LHS_BOUNDARY) { \
+ if (col < k_size) { \
+ reg =lhs.loadPacket<Unaligned>(row, col); \
+ } \
+ } else { \
+ if (col < k_size) { \
+ if (row + 3 < m_size) { \
+ reg =lhs.loadPacket<Unaligned>(row, col); \
+ } else if (row + 2 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ } else if (row + 1 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ } else if (row < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ } \
+ } \
+ } \
Index lhs_vert = base_m+threadIdx.x*4;
@@ -577,7 +578,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -592,7 +593,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
} else {
if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
@@ -765,6 +766,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
float2 rhs_shmem2[][8], const Index m_size,
const Index n_size, const Index k_size,
const Index base_m, const Index base_n) {
+ typedef float Scalar;
// prefetch registers
float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
@@ -788,37 +790,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_LHS_BOUNDARY) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else {
// just CHECK_LHS_BOUNDARY
if (lhs_vert + 3 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else if (lhs_vert + 2 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
@@ -907,8 +909,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -930,8 +932,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (rhs_horiz1 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -952,7 +954,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
} else if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -1135,6 +1137,9 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
typedef float2 LHS_MEM[64][32];
typedef float2 RHS_MEM[128][8];
+ typedef float2 LHS_MEM16x16[32][16];
+ typedef float2 RHS_MEM16x16[64][8];
+
const Index m_block_idx = blockIdx.x;
const Index n_block_idx = blockIdx.y;
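
The hunks above drop the ".template" disambiguator from the mapper loadPacket calls. Strictly, that keyword is required when calling a member template with explicit template arguments on an object whose type depends on a template parameter, though some compilers accept the call without it. A generic, non-Eigen illustration (Mapper and prefetch are hypothetical names):

// Minimal illustration of the ".template" disambiguator in a dependent context.
struct Mapper {
  template <int Alignment>
  float loadPacket(int i, int j) const { return float(i + j + Alignment); }
};

template <typename LhsMapper>
float prefetch(const LhsMapper& lhs, int row, int col) {
  // Without ".template", the "<" would parse as a less-than comparison.
  return lhs.template loadPacket<0>(row, col);
}

int main() { return prefetch(Mapper(), 1, 2) > 0.f ? 0 : 1; }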
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index ab320a5..9b2cb3f 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -22,14 +22,8 @@ enum {
/*
* Implementation of the Eigen blas_data_mapper class for tensors.
*/
-/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which
-/// is scalar * for CoeffLoader.
-template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer> struct CoeffLoader;
-template<typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t,
- int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
- template <class> class MakePointer_ = MakePointer> class BaseTensorContractionMapper;
-
-template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_> struct CoeffLoader {
+
+template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
enum {
DirectOffsets = false
};
@@ -53,7 +47,7 @@ template <typename Tensor, bool HasRawAccess, template <class> class MakePointer
const Tensor m_tensor;
};
-template <typename Tensor, template <class> class MakePointer_> struct CoeffLoader<Tensor, true, MakePointer_> {
+template <typename Tensor> struct CoeffLoader<Tensor, true> {
enum {
DirectOffsets = true
};
@@ -73,14 +67,13 @@ template <typename Tensor, template <class> class MakePointer_> struct CoeffLoad
}
private:
typedef typename Tensor::Scalar Scalar;
-
- typename MakePointer_<const Scalar>::Type m_data;
+ const Scalar* m_data;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
- int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
+ int packet_size, bool inner_dim_contiguous, int Alignment>
class SimpleTensorContractionMapper {
public:
EIGEN_DEVICE_FUNC
@@ -96,7 +89,7 @@ class SimpleTensorContractionMapper {
m_k_strides(k_strides) { }
enum {
- DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
+ DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
};
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
@@ -213,22 +206,23 @@ class SimpleTensorContractionMapper {
}
protected:
- CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
+ CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
const nocontract_t m_nocontract_strides;
const nocontract_t m_ij_strides;
const contract_t m_contract_strides;
const contract_t m_k_strides;
};
+
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous,
- bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
-class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
+ bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
{
public:
- typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+ typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
@@ -241,9 +235,9 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
typedef typename Tensor::PacketReturnType Packet;
typedef typename unpacket_traits<Packet>::half HalfPacket;
- template <typename PacketT,int AlignmentType>
+ template <int AlignmentType>
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
+ EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
@@ -281,13 +275,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
}
data[packet_size - 1] = this->m_tensor.coeff(last);
- return pload<PacketT>(data);
- }
-
- template <int AlignmentType>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
- return this->load<Packet,AlignmentType>(i,j);
+ return pload<Packet>(data);
}
template <int AlignmentType>
@@ -313,11 +301,11 @@ template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
bool inner_dim_contiguous,
- bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
-class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
+ bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
{
public:
- typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+ typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
@@ -334,12 +322,6 @@ class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, con
data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
return pload<typename Tensor::PacketReturnType>(data);
}
- template <typename PacketT,int> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
- EIGEN_ALIGN_MAX Scalar data[1];
- data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
- return pload<PacketT>(data);
- }
template <int> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
return loadPacket(i, j);
@@ -351,14 +333,14 @@ template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionSubMapper {
public:
typedef typename Tensor::PacketReturnType Packet;
typedef typename unpacket_traits<Packet>::half HalfPacket;
- typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
- typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
typedef Self LinearMapper;
enum {
@@ -403,14 +385,6 @@ class TensorContractionSubMapper {
return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
}
- template <typename PacketT, int AlignmentType>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
- if (UseDirectOffsets) {
- return m_base_mapper.template load<PacketT,AlignmentType>(i, j);
- }
- return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
- }
-
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
if (UseDirectOffsets) {
return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
@@ -418,7 +392,7 @@ class TensorContractionSubMapper {
return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet& p) const {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
if (UseDirectOffsets) {
m_base_mapper.storePacket(i, 0, p);
}
@@ -458,14 +432,14 @@ template<typename Scalar_, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper
- : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
+ : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
public:
typedef Scalar_ Scalar;
- typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
- typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper VectorMapper;
EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
deleted file mode 100644
index e87de0c..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ /dev/null
@@ -1,400 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclConvertToDeviceExpression.h
- *
- * \brief:
- * TensorContractionsycl
- *
-*****************************************************************/
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
-namespace Eigen {
-
-template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> :
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > {
-
- typedef const Eigen::SyclDevice Device;
-
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
- typedef TensorContractionEvaluatorBase<Self> Base;
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
- enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- };
-
- // Most of the code is assuming that both input tensors are ColMajor. If the
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
- // will pretend B is LHS and A is RHS.
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
- static const int LDims =
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
- static const int RDims =
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
- static const int ContractDims = internal::array_size<Indices>::value;
-
- typedef array<Index, LDims> left_dim_mapper_t;
- typedef array<Index, RDims> right_dim_mapper_t;
-
- typedef array<Index, ContractDims> contract_t;
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
- static const int NumDims = LDims + RDims - 2 * ContractDims;
-
- typedef DSizes<Index, NumDims> Dimensions;
-
- // typedefs needed in evalTo
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- typedef typename LeftEvaluator::Dimensions LeftDimensions;
- typedef typename RightEvaluator::Dimensions RightDimensions;
-
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
- Base(op, device) {}
-
- // We need to redefine this method to make nvcc happy
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
- this->m_leftImpl.evalSubExprsIfNeeded(NULL);
- this->m_rightImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- evalTo(data);
- return false;
- } else {
- this->m_result = static_cast<Scalar*>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
- evalTo(this->m_result);
- return true;
- }
- }
- const Eigen::SyclDevice& device() const {return this->m_device;}
- void evalTo(Scalar* buffer) const {
- // Here is the result
- if (this->m_lhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, false, false, Unaligned>(buffer);
- }
- }
- }
- else {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, false, false, Unaligned>(buffer);
- }
- }
- }
- }
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalTyped(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
- EIGEN_UNUSED_VARIABLE(k)
- // rows in left side
- const Index m = this->m_i_size;
- // columns in right side
- const Index n = this->m_j_size;
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
- LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
- this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
- this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
- }
- // required by sycl to construct the expr on the device. Returns original left_impl
- const TensorEvaluator<LeftArgType, Device>& left_impl() const {
- return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_leftImpl, this->m_rightImpl);
- }
- // required by sycl to construct the expr on the device. Returns original right_impl
- const TensorEvaluator<RightArgType, Device>& right_impl() const {
- return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl);
- }
-};
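The evaluator above swaps LHS and RHS when the operands are RowMajor so that a single column-major contraction path can be reused. A short worked identity (not part of the Eigen sources) spells out why the swap is valid:

    C = A * B                                   (requested product)
    memory of row-major(M)  ==  memory of column-major(M^T)
    (A * B)^T = B^T * A^T

So feeding the column-major kernel B as the "left" operand and A as the "right" operand produces C^T in column-major storage, which is bit-for-bit the row-major C the caller expects.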
-
-template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
-typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
-typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN,
-typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
- typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
- typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr;
- typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr;
- LHSFunctorExpr lhs_functors;
- RHSFunctorExpr rhs_functors;
- LhsLocalAcc localLhs;
- RhsLocalAcc localRhs;
- OutAccessor out_res;
- Index roundUpK, M, N, K;
- ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
- LeftNocontractT m_i_strides, m_left_nocontract_strides;
- RightNocontractT m_j_strides, m_right_nocontract_strides;
- LHSTupleType left_tuple_of_accessors;
- RHSTupleType right_tuple_of_accessors;
- Device dev;
-
-
- KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
- Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
- ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
- LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
- :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
- m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
- m_right_contracting_strides(m_right_contracting_strides_),
- m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
- m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_),
- left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr;
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr;
- auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors);
- auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors);
- typedef decltype(lhs_dev_expr.expr) LeftArgType;
- typedef decltype(rhs_dev_expr.expr) RightArgType;
- typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, LeftNocontractT,
- ContractT, 1,
- lhs_inner_dim_contiguous,
- false, Unaligned, MakeGlobalPointer> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, RightNocontractT,
- ContractT, 1,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
- // initialize data mappers must happen inside the kernel for device eval
- LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
- lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
- RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
- rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
- auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
- // Matmul Kernel
- // Thread identifiers
- const Index mLocalThreadId = itemID.get_local(0); // Local ID row
- const Index nLocalThreadId = itemID.get_local(1); // Local ID col
- const Index mGroupId = itemID.get_group(0); // Work-group ID row
-    const Index nGroupId = itemID.get_group(1); // Work-group ID col
- const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
- // Allocate register space
- float privateLhs;
- float privateRhs[WorkLoadPerThreadN];
- float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
-    // Initialise the privateRes accumulation registers
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- privateRes[wLPTM][wLPTN] = 0.0f;
- }
- }
-
- // Tile Lhs
- for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
- Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localLhsRow = localLhsLinearId% TileSizeDimM;
- Index localLhsCol = localLhsLinearId/TileSizeDimM;
- // Load the value (wide vector load)
- Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
- localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0);
- }
- // Tile Rhs
- for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
- Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localRhsRow = localRhsLinearId% TileSizeDimN;
- Index localRhsCol = localRhsLinearId/TileSizeDimN;
- // Load the value (wide vector load)
- Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
- localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0);
-
- }
- // Loop over all tiles
- const Index numTiles = roundUpK/TileSizeDimK;
- Index firstHalf=0;
- do {
- // Synchronise
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- // Load the next tile of Lhs and Rhs into local memory
- Index nextHalf = firstHalf + 1;
- if (nextHalf < numTiles) {
- // Tile A
- for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
- Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localLhsRow = localLhsLinearId% TileSizeDimM;
- Index localLhsCol = localLhsLinearId/TileSizeDimM;
- // global K id
- Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
- // Store the loaded value into local memory
- localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0);
- }
- // Tile B
- for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
- Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localRhsRow = localRhsLinearId% TileSizeDimN;
- Index localRhsCol = localRhsLinearId/TileSizeDimN;
- // Load the value (wide vector load)
- Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
- // Store the loaded vector into local memory
- localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0);
- }
- }
- // Loop over the values of a single tile
- for (Index k=0; k<TileSizeDimK; k++) {
- // Cache the values of localRhs in registers
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
- privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
- }
- // Perform the computation
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
- privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
- }
- }
- }
- // Next tile
- firstHalf++;
- } while (firstHalf<numTiles);
-
- // Store the final results in C
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
- if (globalRow< M){
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
- if(globalCol<N)
- out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
- }
- }
- }
-
- }
-
-};
-template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
-
-static const Index TileSizeDimM = 32ul; // Tile size for dimension M
-static const Index TileSizeDimN = 32ul; // Tile size for dimension N
-static const Index TileSizeDimK = 16ul; // Tile size for dimension K
-static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M
-static const Index WorkLoadPerThreadN = 4ul; // Work load per thread in dimension N
-static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
-static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
-static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
-static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression
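With the tile sizes above, the derived launch constants work out as follows (plain arithmetic, shown only for reference):

    LocalThreadSizeM = 32 / 4 = 8
    LocalThreadSizeN = 32 / 4 = 8              // 8 x 8 = 64 work-items per work-group
    LoadPerThreadLhs = (16 * 4 * 4) / 32 = 8   // Lhs elements loaded per work-item per tile
    LoadPerThreadRhs = (16 * 4 * 4) / 32 = 8   // Rhs elements loaded per work-item per tile

Each work-group therefore covers a 32 x 32 output tile, with every work-item accumulating a 4 x 4 block of it.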
-
-// RoundUp function to make sure that the global work size is divisible by the local (tile) work size
-static Index RoundUp(Index x, Index y) {
- return ((((x) + (y) - 1) / (y))*(y));
-}
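RoundUp is ordinary round-up-to-a-multiple integer arithmetic; two quick examples:

    RoundUp(100, 16) = ((100 + 15) / 16) * 16 = 7 * 16 = 112
    RoundUp( 96, 16) = (( 96 + 15) / 16) * 16 = 6 * 16 =  96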
-
-template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
- static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
- ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
- LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
-
- typedef typename Self::XprType HostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
- typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr;
- typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr;
- typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr;
- typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> RHSFunctorExpr;
- // extract lhs functor list
- LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
- // extract rhs functor list
-  RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl());
-
- Index roundUpK = RoundUp(K, TileSizeDimK);
- Index roundUpM = RoundUp(M, TileSizeDimM);
- Index roundUpN = RoundUp(N, TileSizeDimN);
-
- self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
- /// work-around for gcc bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
- /// work-around for gcc bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType;
- // create lhs tuple of accessors
- LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl());
- // create rhs tuple of accessors
- RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl());
-
- // Local memory for elements of Lhs
- typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc;
- LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
- // Local memory for elements of Rhs
- typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
- RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
-
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
-      // OutScalar memory
- OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
-
- // sycl parallel for
- cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
- cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
- KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
- RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
- WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
- localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
- m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
- });
- self.device().asynchronousExec();
- }
-};
-
-} // end namespace Eigen
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
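The contraction kernel removed above is a double-buffered, register-tiled GEMM: each work-group stages Lhs and Rhs tiles in local memory and each work-item accumulates a small register block of the output. A minimal single-threaded C++ sketch of the same tiling idea (hypothetical names, column-major storage, no SYCL and no double buffering) is:

    #include <algorithm>
    #include <vector>

    // Sketch of the tiling scheme: C (M x N) += A (M x K) * B (K x N),
    // all column-major, processed in TileM x TileK and TileK x TileN tiles,
    // accumulating into a small register block ("privateRes" in the kernel).
    // C is assumed zero-initialized, mirroring the device-side memset.
    template <int TileM, int TileN, int TileK>
    void tiled_gemm(const std::vector<float>& A, const std::vector<float>& B,
                    std::vector<float>& C, int M, int N, int K) {
      for (int m0 = 0; m0 < M; m0 += TileM) {
        for (int n0 = 0; n0 < N; n0 += TileN) {
          float acc[TileM][TileN] = {};             // register block
          for (int k0 = 0; k0 < K; k0 += TileK) {   // loop over K tiles
            const int kEnd = std::min(k0 + TileK, K);
            for (int k = k0; k < kEnd; ++k) {
              for (int m = m0; m < std::min(m0 + TileM, M); ++m) {
                const float a = A[k * M + m];       // A(m, k)
                for (int n = n0; n < std::min(n0 + TileN, N); ++n) {
                  acc[m - m0][n - n0] += a * B[n * K + k];  // B(k, n)
                }
              }
            }
          }
          for (int m = m0; m < std::min(m0 + TileM, M); ++m)
            for (int n = n0; n < std::min(n0 + TileN, N); ++n)
              C[n * M + m] += acc[m - m0][n - n0];  // C(m, n)
        }
      }
    }

The deleted kernel additionally splits the m/n loops across work-items and overlaps the load of tile k+1 with computation on tile k through the (firstHalf%2)/(nextHalf%2) halves of local memory.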
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index d30cc96..ee16cde 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -116,28 +116,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
- const Index m = this->m_i_size;
- const Index n = this->m_j_size;
- const Index k = this->m_k_size;
- if (m == 0 || n == 0 || k == 0) return;
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- if (this->m_can_use_xsmm) {
- bool transposeA = !this->m_lhs_inner_dim_contiguous;
- bool transposeB = !this->m_rhs_inner_dim_contiguous;
- internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index>
- blocking(k, m, n, this->m_device.numThreads(), transposeA,
- transposeB);
-
- if (blocking.num_threads() == 1) {
- this->evalGemmXSMM(buffer);
- } else {
- ContextXsmm<Alignment>(this, buffer, m, n, k, blocking).run();
- }
- return;
- }
-#endif
-
typedef
typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
LhsScalar;
@@ -169,7 +147,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
Traits::mr, Traits::nr, false, false>
GebpKernel;
-
+ const Index m = this->m_i_size;
+ const Index n = this->m_j_size;
+ const Index k = this->m_k_size;
+ if (m == 0 || n == 0 || k == 0) return;
// Compute a set of algorithm parameters:
// - kernel block sizes (bm, bn, bk)
@@ -1063,187 +1044,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
rhsCost.dropMemoryCost();
return cost + lhsCost + rhsCost;
}
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- template<int Alignment>
- class ContextXsmm {
- public:
- ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k,
- const internal::TensorXsmmContractionBlocking<LhsScalar,
- RhsScalar, Index>& blocking):
- device(self->m_device),
- m(m), k(k), n(n),
- stride_a(blocking.transposeA() ? k : m),
- stride_b(blocking.transposeB() ? n : k),
- stride_c(m),
- bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()),
- blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()),
- blocks_n(blocking.blocks_n()),
- copyA(blocking.copyA()), copyB(blocking.copyB()),
- transposeA(blocking.transposeA()), transposeB(blocking.transposeB()),
- num_threads(blocking.num_threads()),
- buffer(buffer),
- leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()),
- workers_done(blocking.num_threads()),
-
- packingA_jobs(0), packingB_jobs(0), compute_jobs(0),
- packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()) {}
-
- void worker() {
- // Pack
-
- if (copyA) {
- while (true) {
- uint32_t mk = packingA_jobs++;
- Index mi = mk / blocks_k;
- Index ki = mk % blocks_k;
- if (mi >= blocks_m) break;
-
- LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki);
- if (transposeA) {
- const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki);
- libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki),
- actual_bm(mi), stride_a, bm);
- } else {
- const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi);
- internal::pack_simple<LhsScalar, Index>(blockA, current_a,
- actual_bk(ki), actual_bm(mi), bm, stride_a);
- }
- packingA_done.at(mi)++;
- }
- }
-
- if (copyB) {
- while (true) {
- uint32_t nk = packingB_jobs++;
- Index ni = nk / blocks_k;
- Index ki = nk % blocks_k;
- if (ni >= blocks_n) break;
-
- RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki);
- if (transposeB) {
- const RhsScalar * current_b = rightData + (ki*bk)*stride_b +
- (ni*bn);
- libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni),
- actual_bk(ki), stride_b, bk);
- } else {
- const RhsScalar * current_b = rightData + (ni*bn)*stride_b +
- (ki*bk);
- internal::pack_simple<RhsScalar, Index>(blockB, current_b,
- actual_bn(ni), actual_bk(ki), bk, stride_b);
- }
- packingB_done.at(ni)++;
- }
- }
-
- // Compute
-
- while (true) {
- uint32_t mn = compute_jobs++;
- Index mi = mn / blocks_n;
- Index ni = mn % blocks_n;
- if (mi >= blocks_m) break;
-
- // Wait for mi, ni packings to be done. This is more fine-grained than
- // waiting for all workers to finish packing.
- while ((copyA && (packingA_done.at(mi) < blocks_k)) ||
- (copyB && (packingB_done.at(ni) < blocks_k)))
- {}
-
- for (Index ki=0; ki < blocks_k; ++ki) {
- const LhsScalar * current_a = copyA ?
- blocksA + (bk*bm) * (mi*blocks_k+ki) :
- leftData + (bk*ki)*stride_a + (bm*mi);
- const RhsScalar * current_b = copyB ?
- blocksB + (bk*bn) * (ni*blocks_k+ki) :
- rightData + (ni*bn)*stride_b + (bk*ki);
-
- Index current_stride_a = copyA ? bm : stride_a;
- Index current_stride_b = copyB ? bk : stride_b;
-
- // Memory may not be zeroed, overwrite instead of adding in first
- // iteration.
- float beta = ki == 0 ? 0 : 1;
-
- Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c;
- internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(
- 0, actual_bm(mi), actual_bn(ni), actual_bk(ki),
- current_stride_a, current_stride_b, stride_c, 1, beta, 0)
- (current_a, current_b, current_c);
- }
- }
-
- workers_done.Notify();
- }
-
- void run() {
- // Parallelization strategy.
- //
- // First pack A into blocks (sharding by m, k) and B (sharding by n,k),
- // then shard by m, n.
- //
- // Do not use advanced ThreadPool queuing, just run a single long-standing
- // function in each thread.
- if (copyA) {
- blocksA = static_cast<LhsScalar*>(device.allocate(
- (blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar)));
- }
- if (copyB) {
- blocksB = static_cast<RhsScalar*>(device.allocate(
- (blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar)));
- }
-
- for (Index i = 0; i < num_threads; ++i) {
- device.enqueueNoNotification([=]() { worker(); });
- }
-
- workers_done.Wait();
-
- if (copyA) {
- device.deallocate(blocksA);
- }
- if (copyB) {
- device.deallocate(blocksB);
- }
- }
-
- private:
- // real block size for block index in [0, ..., blocks - 1].
- Index actual_bm(Index mi) const {
- return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m;
- }
- Index actual_bk(Index ki) const {
- return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k;
- }
- Index actual_bn(Index ni) const {
- return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n;
- }
-
- const Device& device;
- Index m, k, n;
- Index stride_a, stride_b, stride_c;
- Index bm, bk, bn; // Block sizes.
- Index blocks_m, blocks_k, blocks_n; // Number of blocks in each dimension.
- bool copyA, copyB, transposeA, transposeB;
- Index num_threads;
- Scalar *buffer;
- const LhsScalar *leftData;
- const RhsScalar *rightData;
-
- LhsScalar *blocksA;
- RhsScalar *blocksB;
- // barrier for joining all threads after all done.
- Barrier workers_done;
- // "queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q)
- std::atomic<uint32_t> packingA_jobs;
- std::atomic<uint32_t> packingB_jobs;
- std::atomic<uint32_t> compute_jobs;
- // already packed blocks for each mi-panel in A and ni-panel in B.
- std::vector<std::atomic<uint8_t>> packingA_done;
- std::vector<std::atomic<uint8_t>> packingB_done;
- };
-#endif
-
};
} // end namespace Eigen
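The removed ContextXsmm helper hands out packing and compute work by letting each worker repeatedly claim a flat job id from an atomic counter and decode it into a 2D block index (mi = id / blocks, ni = id % blocks). A small standalone C++ sketch of that distribution pattern (hypothetical names, no libxsmm or Eigen thread pool):

    #include <atomic>
    #include <cstdint>

    // Each worker claims the next flat job id and maps it to a
    // (block_row, block_col) pair; workers stop once the row index
    // runs past the grid. Mirrors the compute_jobs / packingA_jobs /
    // packingB_jobs counters in the deleted ContextXsmm class.
    struct FlatJobQueue {
      std::atomic<uint32_t> next{0};
      int rows, cols;                        // blocks_m x blocks_n grid

      template <typename Body>
      void worker(Body&& body) {
        while (true) {
          const uint32_t id = next++;        // atomic fetch-and-increment
          const int row = static_cast<int>(id) / cols;
          const int col = static_cast<int>(id) % cols;
          if (row >= rows) break;            // grid exhausted
          body(row, col);                    // e.g. run one GEMM block
        }
      }
    };

Every thread runs worker() with the same body; the fetch-and-increment guarantees each (row, col) pair is handed out exactly once, and threads simply exit once the counter runs past the grid.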
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index b29968b..860a694 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -246,9 +246,6 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the sycl accessor
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
protected:
template <int LoadMode, bool ActuallyVectorize>
struct PacketConv {
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 378f5cc..abdf742 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -100,7 +100,7 @@ class IndexMapper {
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
- if (static_cast<size_t>(i + 1) < offset) {
+ if (i + 1 < offset) {
m_cudaInputStrides[i] =
m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
m_cudaOutputStrides[i] =
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
deleted file mode 100644
index 4247c1c..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ /dev/null
@@ -1,476 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
-
-namespace Eigen {
-
-/** \class TensorConvolution
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor convolution class.
- *
- *
- */
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel1D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize, range_x, range_y;
-Buffer_accessor buffer_acc;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
-
- void operator()(cl::sycl::nd_item<2> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-
-  const size_t num_x_input = (itemID.get_local_range()[0] + kernelSize - 1); // number of input elements needed per plane in shared memory
- const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input;
- const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1));
- /// fill the shared memory
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + plane_kernel_offset ;
- const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start);
- if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- // calculate the convolution
- const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
- const size_t index = plane_kernel_offset+ itemID.get_local(0);
- for (size_t k = 0; k < kernelSize; ++k) {
- result += (local_acc[k + index] * kernel_ptr[k]);
- }
- const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
- +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
- buffer_ptr[tensor_index] = result;
- }
- }
-};
-
-
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel2D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
-Buffer_accessor buffer_acc;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
-
- void operator()(cl::sycl::nd_item<3> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-    const size_t num_x_input = (itemID.get_local_range()[0] + kernelSize_x - 1); // number of input elements needed along x for each plane in shared memory
-    const size_t num_y_input = (itemID.get_local_range()[1] + kernelSize_y - 1); // number of input elements needed along y for each plane in shared memory
- const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2));
- const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input;
-
- /// fill the shared memory
- const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
- for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
- const size_t local_input_offset = num_x_input * (j + plane_kernel_offset);
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + local_input_offset;
- const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start );
- if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
- }
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- // calculate the convolution
-    const size_t first_x_output_start = itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
-    const size_t first_y_output_start = itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
- for (size_t j = 0; j < kernelSize_y; j++) {
- size_t kernel_offset =kernelSize_x * j;
- const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0);
- for (size_t i = 0; i < kernelSize_x; i++) {
- result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]);
- }
- }
-      const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
-      +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_x_output_start, itemID.get_local(1) + first_y_output_start);
- buffer_ptr[tensor_index] = result;
- }
- }
-};
-
-
-
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel3D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
-Buffer_accessor buffer_acc;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
- const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
- kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
-
- void operator()(cl::sycl::nd_item<3> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-    const size_t num_x_input = (itemID.get_local_range()[0] + kernelSize_x - 1); // number of input elements needed along x in shared memory
-    const size_t num_y_input = (itemID.get_local_range()[1] + kernelSize_y - 1); // number of input elements needed along y in shared memory
-    const size_t num_z_input = (itemID.get_local_range()[2] + kernelSize_z - 1); // number of input elements needed along z in shared memory
- const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
- const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2];
- for(size_t p=0; p<numP; p++){
- /// fill the shared memory
- const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
- for (size_t k = itemID.get_local(2); k < num_z_input; k += itemID.get_local_range()[2]) {
- for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + (num_x_input * (j + (num_y_input * k)));
- const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start , k+ first_z_input_start );
- if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) && ((j + first_y_input_start) < (range_y +kernelSize_y-1)) && ((k + first_z_input_start) < (range_z +kernelSize_z-1)) ){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
- }
- }
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- // calculate the convolution
-      const size_t first_x_output_start = itemID.get_group(0)*(itemID.get_local_range()[0]); // x
-      const size_t first_y_output_start = itemID.get_group(1)*(itemID.get_local_range()[1]); // y
-      const size_t first_z_output_start = itemID.get_group(2)*(itemID.get_local_range()[2]); // z
-
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
- for (size_t k = 0; k < kernelSize_z; k++) {
- for (size_t j = 0; j < kernelSize_y; j++) {
- for (size_t i = 0; i < kernelSize_x; i++) {
- const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k);
- const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2))));
- result += (local_acc[local_index] * kernel_ptr[kernel_index]);
- }
- }
- }
-        const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
-        +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_x_output_start, itemID.get_local(1) + first_y_output_start, itemID.get_local(2) + first_z_output_start );
- buffer_ptr[tensor_index] = result;
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- }
- }
-};
-
-
-template<typename Indices, typename InputArgType, typename KernelArgType>
-struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice>
-{
- typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
-
- static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions>::value;
- static const int NumKernelDims = internal::array_size<Indices>::value;
- typedef typename XprType::Index Index;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions KernelDimensions;
- typedef const Eigen::SyclDevice Device;
-
- enum {
- IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
- PacketAccess = false,
- Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
- : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- const typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
- const typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
-
- m_dimensions = m_inputImpl.dimensions();
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = op.indices()[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- m_dimensions[index] = result_dim;
- }
- }
-
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, const Eigen::SyclDevice>::type PacketReturnType;
- typedef typename InputArgType::Scalar Scalar;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
- preloadKernel();
- m_inputImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- executeEval(data);
- return false;
- } else {
- m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
- executeEval(m_buf);
- return true;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
- m_inputImpl.cleanup();
- if (m_buf) {
- m_device.deallocate(m_buf);
- m_buf = NULL;
- }
- if (m_local_kernel) {
- m_device.deallocate((void*)m_kernel);
- m_local_kernel = false;
- }
- m_kernel = NULL;
- }
- /// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
- /// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
- // Don't make a local copy of the kernel unless we have to (i.e. it's an
- // expression that needs to be evaluated)
- const Scalar* in_place = m_kernelImpl.data();
- if (in_place) {
- m_kernel = in_place;
- m_local_kernel = false;
- } else {
- size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
- Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
- typedef TensorEvalToOp<const KernelArgType> EvalTo;
- EvalTo evalToTmp(local, m_kernelArg);
- const bool PacketAccess = internal::IsVectorizable<const Eigen::SyclDevice, KernelArgType>::value;
- internal::TensorExecutor<const EvalTo, const Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
- m_kernel = local;
- m_local_kernel = true;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const {
- typedef TensorEvaluator<InputArgType, const Eigen::SyclDevice> InputEvaluator;
- typedef typename InputEvaluator::Dimensions InputDims;
-
- typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
- // extract input functor list
- InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
-
-
- m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
-
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> InputLocalAcc;
- /// work-around for gcc 4.8 auto bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl)) InputTupleType;
- // create input tuple of accessors
- InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
-
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType;
- OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data);
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
- KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);
-
- switch (NumKernelDims) {
- case 1: {
- const size_t numX = dimensions()[m_indices[0]];
- const size_t numP = dimensions().TotalSize() / numX;
- const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y;
- m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y );
- const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y);
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range
- auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- const array<Index, 1> indices{{m_indices[0]}};
- const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
- internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
- EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors));
- break;
- }
-
- case 2: {
- const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
- const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
- const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
- const size_t numX = dimensions()[m_indices[idxX]];
- const size_t numY = dimensions()[m_indices[idxY]];
- const size_t numP = dimensions().TotalSize() / (numX*numY);
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
- m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
- const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z;
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
- auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- const array<Index, 2> indices {{m_indices[idxX], m_indices[idxY]}};
- const array<Index, 2> kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}};
- internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
- EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors));
- break;
- }
-
- case 3: {
- const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
- const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
- const size_t idxZ =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
- const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
- const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ];
- const size_t numX = dimensions()[m_indices[idxX]];
- const size_t numY = dimensions()[m_indices[idxY]];
- const size_t numZ = dimensions()[m_indices[idxZ]];
- const size_t numP = dimensions().TotalSize() / (numX*numY*numZ);
- const array<Index, 3> indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}};
- const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}};
- internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
- m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
-        const size_t shared_mem = (tileSize_x + kernel_size_x - 1)*(tileSize_y + kernel_size_y - 1) * (tileSize_z + kernel_size_z - 1);
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
- auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
- EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
- numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors));
- break;
- }
-
- default: {
- EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
- }
- }
- });
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- eigen_assert(m_buf);
- eigen_assert(index < m_dimensions.TotalSize());
- return m_buf[index];
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
- {
- eigen_assert(m_buf);
- eigen_assert(index < m_dimensions.TotalSize());
- return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
- // model.
- const double kernel_size = m_kernelImpl.dimensions().TotalSize();
- // We ignore the use of fused multiply-add.
- const double convolve_compute_cost =
- TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
- const double firstIndex_compute_cost =
- NumDims *
- (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>());
- return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
- kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
- m_kernelImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, convolve_compute_cost, vectorized,
- PacketSize));
- }
-
- private:
- // No assignment (copies are needed by the kernels)
- TensorEvaluator& operator = (const TensorEvaluator&);
- TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl;
- KernelArgType m_kernelArg;
- TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl;
- Indices m_indices;
- Dimensions m_dimensions;
- Scalar* m_buf;
- const Scalar* m_kernel;
- bool m_local_kernel;
- const Eigen::SyclDevice& m_device;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
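The 1D convolution kernel removed above stages a tile of the input plus a halo of kernelSize - 1 extra values in local memory, synchronises the work-group, and then computes one output per work-item from the cached window. A scalar C++ sketch of that staging scheme (hypothetical names, no SYCL):

    #include <vector>

    // Valid 1D convolution of `input` (length n) with `kernel` (length k),
    // processed tile by tile: each tile caches tileSize + k - 1 input values
    // (the tile plus its right halo) before computing tileSize outputs.
    std::vector<float> conv1d_tiled(const std::vector<float>& input,
                                    const std::vector<float>& kernel,
                                    int tileSize) {
      const int n = static_cast<int>(input.size());
      const int k = static_cast<int>(kernel.size());
      const int outSize = n - k + 1;
      std::vector<float> output(outSize, 0.0f);
      std::vector<float> local(tileSize + k - 1);   // stands in for local_acc

      for (int start = 0; start < outSize; start += tileSize) {
        // Stage the tile plus halo, padding with zeros past the end.
        for (int i = 0; i < tileSize + k - 1; ++i) {
          const int idx = start + i;
          local[i] = (idx < n) ? input[idx] : 0.0f;
        }
        // (a work-group barrier would go here on the device)
        for (int x = 0; x < tileSize && start + x < outSize; ++x) {
          float result = 0.0f;
          for (int j = 0; j < k; ++j) result += local[x + j] * kernel[j];
          output[start + x] = result;
        }
      }
      return output;
    }

The 2D and 3D kernels apply the same idea per dimension, which is where the (tileSize + kernel_size - 1) factors in the shared-memory size computations come from.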
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index be8d693..4f5767b 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -88,7 +88,7 @@ static void initializeDeviceProp() {
#if __cplusplus >= 201103L
std::atomic_thread_fence(std::memory_order_acquire);
#endif
- EIGEN_SLEEP(1000);
+ sleep(1);
}
}
}
@@ -217,10 +217,7 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
- EIGEN_UNUSED_VARIABLE(dst);
- EIGEN_UNUSED_VARIABLE(src);
- EIGEN_UNUSED_VARIABLE(n);
- eigen_assert(false && "The default device should be used instead to generate kernel code");
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index ccaaa6c..9d14139 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -45,7 +45,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#ifndef __CUDA_ARCH__
// Running on the host CPU
return l1CacheSize();
#else
@@ -55,7 +55,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#ifndef __CUDA_ARCH__
// Running single threaded on the host CPU
return l3CacheSize();
#else
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index e209799..7c03989 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -16,400 +16,107 @@
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
namespace Eigen {
-
- #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
-
- template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
- public:
- MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
- auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc);
- auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc);
- auto globalid = itemID.get_global_linear_id();
- if (globalid < m_rng) {
- dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
- }
- }
-
- private:
- read_accessor m_src_acc;
- write_accessor m_dst_acc;
- size_t m_rng;
- size_t m_i;
- size_t m_offset;
- };
-
- struct memsetkernelFunctor{
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
- AccType m_acc;
- const size_t m_rng, m_c;
- memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- auto globalid=itemID.get_global_linear_id();
- if (globalid< m_rng) m_acc[globalid] = m_c;
- }
-
- };
-
-EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
- auto devices = cl::sycl::device::get_devices();
- std::vector<cl::sycl::device>::iterator it =devices.begin();
- while(it!=devices.end()) {
-    /// get_devices returns all the available OpenCL devices. Either use a device_selector or exclude devices that ComputeCpp does not support (AMD OpenCL for CPU).
- auto s= (*it).template get_info<cl::sycl::info::device::vendor>();
- std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // exclude AMD CPU devices, which ComputeCpp does not support, but allow APUs
- it=devices.erase(it);
- }
- else{
- ++it;
- }
- }
- return devices;
-}
-
-struct QueueInterface {
- /// class members:
- bool exception_caught_ = false;
-
- mutable std::mutex mutex_;
-
+struct SyclDevice {
+ /// class members
+ /// sycl queue
+ mutable cl::sycl::queue m_queue;
/// std::map is the container used to make sure that we create only one buffer
/// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
 /// If a non-read-only pointer needs to be accessed on the host, we should deallocate it manually.
- mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
- /// sycl queue
- mutable cl::sycl::queue m_queue;
- /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename
- /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it.
- template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s):
+ mutable std::map<const void *, std::shared_ptr<void>> buffer_map;
+ /// creating device by using selector
+ template<typename dev_Selector> SyclDevice(dev_Selector s)
+ :
#ifdef EIGEN_EXCEPTIONS
- m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
+ m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) {
for (const auto& e : l) {
try {
- if (e) {
- exception_caught_ = true;
- std::rethrow_exception(e);
- }
+ std::rethrow_exception(e);
} catch (cl::sycl::exception e) {
- std::cerr << e.what() << std::endl;
- }
+ std::cout << e.what() << std::endl;
+ }
}
}))
#else
-m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
- for (const auto& e : l) {
- if (e) {
- exception_caught_ = true;
- std::cerr << "Error detected Inside Sycl Device."<< std::endl;
-
- }
- }
-}))
+ m_queue(cl::sycl::queue(s))
#endif
{}
+ // destructor
+ ~SyclDevice() { deallocate_all(); }
- /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer.
- /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key
- /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we
- /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer.
- /// The device pointer would be deleted by calling deallocate function.
- EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
- auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes));
- auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer();
- buf.set_final_data(nullptr);
- std::lock_guard<std::mutex> lock(mutex_);
- buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf));
- return static_cast<void*>(ptr);
- }
-
- /// This is used to deallocate the device pointer. p is used as a key inside
- /// the map to find the device buffer and delete it.
- EIGEN_STRONG_INLINE void deallocate(void *p) const {
- std::lock_guard<std::mutex> lock(mutex_);
- auto it = buffer_map.find(static_cast<const uint8_t*>(p));
+ template <typename T> void deallocate(T *p) const {
+ auto it = buffer_map.find(p);
if (it != buffer_map.end()) {
buffer_map.erase(it);
+ internal::aligned_free(p);
}
}
-
- EIGEN_STRONG_INLINE void deallocate_all() const {
- std::lock_guard<std::mutex> lock(mutex_);
- buffer_map.clear();
- }
-
- EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator find_buffer(const void* ptr) const {
- std::lock_guard<std::mutex> lock(mutex_);
- auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr));
- if (it1 != buffer_map.end()){
- return it1;
- }
- else{
- for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){
- auto size = it->second.get_size();
- if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it;
- }
- }
- std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl;
- abort();
- }
-
- // This function checks if the runtime recorded an error for the
- // underlying stream device.
- EIGEN_STRONG_INLINE bool ok() const {
- if (!exception_caught_) {
- m_queue.wait_and_throw();
+ void deallocate_all() const {
+ std::map<const void *, std::shared_ptr<void>>::iterator it=buffer_map.begin();
+ while (it!=buffer_map.end()) {
+ auto p=it->first;
+ buffer_map.erase(it);
+ internal::aligned_free(const_cast<void*>(p));
+ it=buffer_map.begin();
}
- return !exception_caught_;
+ buffer_map.clear();
}
- // destructor
- ~QueueInterface() { buffer_map.clear(); }
-};
-
-struct SyclDevice {
- // class member.
- QueueInterface* m_queue_stream;
- /// QueueInterface is not owned. it is the caller's responsibility to destroy it.
- explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){}
-
- /// Creation of sycl accessor for a buffer. This function first tries to find
+ /// creation of sycl accessor for a buffer. This function first tries to find
/// the buffer in the buffer_map. If found it gets the accessor from it, if not,
- /// the function then adds an entry by creating a sycl buffer for that particular pointer.
- template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer>
- get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const {
- return (get_sycl_buffer(ptr).template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
+ /// the function then adds an entry by creating a sycl buffer for that particular pointer.
+ template <cl::sycl::access::mode AcMd, typename T> inline cl::sycl::accessor<T, 1, AcMd, cl::sycl::access::target::global_buffer>
+ get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const {
+ return (get_sycl_buffer<T>(num_bytes, ptr)->template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
}
- /// Accessing the created sycl device buffer for the device pointer
- EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const {
- return m_queue_stream->find_buffer(ptr)->second;
+ template<typename T> inline std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const {
+ using Type = cl::sycl::buffer<T, 1>;
+ std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> ret = buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(ptr, std::shared_ptr<void>(new Type(cl::sycl::range<1>(num_bytes)),
+ [](void *dataMem) { delete static_cast<Type*>(dataMem); })));
+ (static_cast<Type*>(buffer_map.at(ptr).get()))->set_final_data(nullptr);
+ return ret;
}
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
- tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
- auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
- std::transform(s.begin(), s.end(), s.begin(), ::tolower);
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
- }
- rng = n;
- if (rng==0) rng=static_cast<Index>(1);
- GRange=rng;
- if (tileSize>GRange) tileSize=GRange;
- else if(GRange>tileSize){
- Index xMode = static_cast<Index>(GRange % tileSize);
- if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
- }
- }
-
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
- Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
- }
- Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
- rng1=dim1;
- if (rng1==0 ) rng1=static_cast<Index>(1);
- GRange1=rng1;
- if (tileSize1>GRange1) tileSize1=GRange1;
- else if(GRange1>tileSize1){
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
- }
- tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
- rng0 = dim0;
- if (rng0==0 ) rng0=static_cast<Index>(1);
- GRange0=rng0;
- if (tileSize0>GRange0) tileSize0=GRange0;
- else if(GRange0>tileSize0){
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
- }
+ template <typename T> inline cl::sycl::buffer<T, 1>* get_sycl_buffer(size_t num_bytes,const T * ptr) const {
+ return static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(ptr, num_bytes).first->second.get());
}
-
-
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
- Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
- }
- Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
- rng2=dim2;
- if (rng2==0 ) rng1=static_cast<Index>(1);
- GRange2=rng2;
- if (tileSize2>GRange2) tileSize2=GRange2;
- else if(GRange2>tileSize2){
- Index xMode = static_cast<Index>(GRange2 % tileSize2);
- if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
- }
- pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
- tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
- rng1=dim1;
- if (rng1==0 ) rng1=static_cast<Index>(1);
- GRange1=rng1;
- if (tileSize1>GRange1) tileSize1=GRange1;
- else if(GRange1>tileSize1){
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
- }
- tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
- rng0 = dim0;
- if (rng0==0 ) rng0=static_cast<Index>(1);
- GRange0=rng0;
- if (tileSize0>GRange0) tileSize0=GRange0;
- else if(GRange0>tileSize0){
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
- }
- }
- /// allocate device memory
- EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
- return m_queue_stream->allocate(num_bytes);
+ /// allocating memory on the cpu
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const {
+ return internal::aligned_malloc(8);
}
- /// deallocate device memory
- EIGEN_STRONG_INLINE void deallocate(void *p) const {
- m_queue_stream->deallocate(p);
- }
// some runtime conditions that can be applied here
- EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+ bool isDeviceSuitable() const { return true; }
- /// the memcpy function
- template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
- auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(src));
- auto it2 = m_queue_stream->find_buffer(dst);
- auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
- auto i= (static_cast<const uint8_t*>(dst)) - it2->first;
- offset/=sizeof(Index);
- i/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- sycl_queue().submit([&](cl::sycl::handler &cgh) {
- auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset));
- });
- synchronize();
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+ ::memcpy(dst, src, n);
}
- /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
- /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode
- /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
- /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
- /// this buffer is accessed, the data will be copied to the device.
- template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
- auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
- ::memcpy(host_acc.get_pointer(), src, n);
+ template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const {
+ auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
+ memcpy(host_acc.get_pointer(), src, n);
}
- /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl
- /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
- /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination
- /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data
- /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
- /// to the cpu only once per function call.
- template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
- auto it = m_queue_stream->find_buffer(src);
- auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
- offset/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- // Assuming that the dst is the start of the destination pointer
- auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
- sycl_queue().submit([&](cl::sycl::handler &cgh) {
- auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
- });
- synchronize();
- }
- /// returning the sycl queue
- EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
- /// Here is the implementation of memset function on sycl.
- EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
- size_t rng, GRange, tileSize;
- parallel_for_setup(n, tileSize, rng, GRange);
- sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
- synchronize();
- }
-
- struct memsetCghFunctor{
- cl::sycl::buffer<uint8_t, 1>& m_buf;
- const size_t& rng , GRange, tileSize;
- const int &c;
- memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
- :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
-
- void operator()(cl::sycl::handler &cgh) const {
- auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c));
+ /// With the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon.
+ template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const {
+ auto it = buffer_map.find(src);
+ if (it != buffer_map.end()) {
+ auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(it->second.get()))-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>();
+ memcpy(dst,host_acc.get_pointer(), n);
+ } else{
+ eigen_assert("no device memory found. The memory might be destroyed before creation");
}
- };
-
- EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
- // FIXME
- return 48*1024;
- }
-
- EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
- // We won't try to take advantage of the l2 cache for the time being, and
- // there is no l3 cache on cuda devices.
- return firstLevelCacheSize();
- }
- EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
- // return stream_->deviceProperties().multiProcessorCount;
}
- EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
- // return stream_->deviceProperties().maxThreadsPerBlock;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const {
+ ::memset(buffer, c, n);
}
- EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
- // OpenCL doesnot have such concept
- return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
- // return stream_->deviceProperties().maxThreadsPerMultiProcessor;
- }
- EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
- // return stream_->deviceProperties().sharedMemPerBlock;
- }
- /// No need for sycl it should act the same as CPU version
- EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
-
- EIGEN_STRONG_INLINE void synchronize() const {
- sycl_queue().wait_and_throw(); //pass
- }
-
- EIGEN_STRONG_INLINE void asynchronousExec() const {
- ///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
- //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
- sycl_queue().wait_and_throw(); //pass
-
- }
- // This function checks if the runtime recorded an error for the
- // underlying stream device.
- EIGEN_STRONG_INLINE bool ok() const {
- return m_queue_stream->ok();
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+ return 1;
}
};
-
-
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
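
The rewritten SyclDevice above keys its buffer_map on the host pointer and stores the buffers type-erased as std::shared_ptr<void>, so buffers of different scalar types can share one container and are destroyed by a deleter that remembers the concrete type. The standalone sketch below illustrates only that ownership pattern; BufferRegistry and FakeBuffer are hypothetical names standing in for the SyclDevice map and cl::sycl::buffer<T, 1>, not Eigen or SYCL classes.

#include <cstddef>
#include <iostream>
#include <map>
#include <memory>

// FakeBuffer stands in for cl::sycl::buffer<T, 1>: it only reports when it is
// destroyed, so the ownership behaviour is visible.
template <typename T>
struct FakeBuffer {
  explicit FakeBuffer(std::size_t n) : size(n) {}
  ~FakeBuffer() { std::cout << "destroying buffer of " << size << " elements\n"; }
  std::size_t size;
};

// One map holds buffers of any scalar type: the concrete type is erased behind
// std::shared_ptr<void>, and the deleter captured at creation time knows how
// to delete the real buffer type when the entry is erased.
class BufferRegistry {
 public:
  template <typename T>
  FakeBuffer<T>* add(const T* host_ptr, std::size_t num_elems) {
    auto it = map_.find(host_ptr);
    if (it == map_.end()) {
      using Buf = FakeBuffer<T>;
      std::shared_ptr<void> holder(new Buf(num_elems),
                                   [](void* p) { delete static_cast<Buf*>(p); });
      it = map_.emplace(host_ptr, std::move(holder)).first;
    }
    return static_cast<FakeBuffer<T>*>(it->second.get());
  }

  void deallocate(const void* host_ptr) { map_.erase(host_ptr); }  // deleter runs here

 private:
  std::map<const void*, std::shared_ptr<void>> map_;
};

int main() {
  float host[16];
  BufferRegistry registry;
  FakeBuffer<float>* buf = registry.add(host, 16);
  std::cout << "registered buffer of " << buf->size << " elements\n";
  registry.deallocate(host);  // prints the destruction message
  return 0;
}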
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 16180ca..069680a 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -12,6 +12,17 @@
namespace Eigen {
+// Use the SimpleThreadPool by default. We'll switch to the new non blocking
+// thread pool later.
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
+typedef NonBlockingThreadPool ThreadPool;
+#else
+template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
+typedef SimpleThreadPool ThreadPool;
+#endif
+
+
// Barrier is an object that allows one or more threads to wait until
// Notify has been called a specified number of times.
class Barrier {
@@ -245,7 +256,7 @@ struct ThreadPoolDevice {
// Split into halves and submit to the pool.
Index mid = first + divup((last - first) / 2, block_size) * block_size;
pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
- handleRange(first, mid);
+ pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
};
handleRange(0, n);
barrier.Wait();
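
The parallelFor change above stops running one half of each split on the calling thread and instead submits both halves back to the pool, with the Barrier counting finished leaf blocks. A rough standalone sketch of that scheduling shape, using plain std::thread in place of the Eigen thread pool; the Barrier, block_size and data sizes here are illustrative choices, not the library's values.

#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <numeric>
#include <thread>
#include <vector>

// Counts down from `count`; Wait() returns once Notify() has been called
// that many times (one call per finished leaf block).
class Barrier {
 public:
  explicit Barrier(unsigned count) : count_(count) {}
  void Notify() {
    std::lock_guard<std::mutex> lk(mu_);
    if (--count_ == 0) cv_.notify_all();
  }
  void Wait() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return count_ == 0; });
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  unsigned count_;
};

int main() {
  const long n = 16, block_size = 4;
  std::vector<long> data(n, 1);
  Barrier barrier(static_cast<unsigned>(n / block_size));
  std::vector<std::thread> workers;
  std::mutex workers_mu;

  // Recursively split [first, last); both halves go to worker threads,
  // mirroring the two Schedule() calls in the patched parallelFor.
  std::function<void(long, long)> handleRange = [&](long first, long last) {
    if (last - first <= block_size) {
      for (long i = first; i < last; ++i) data[i] *= 2;  // the per-block work
      barrier.Notify();
      return;
    }
    const long mid = first + (last - first) / 2;
    std::lock_guard<std::mutex> lk(workers_mu);
    workers.emplace_back([&handleRange, mid, last] { handleRange(mid, last); });
    workers.emplace_back([&handleRange, first, mid] { handleRange(first, mid); });
  };

  handleRange(0, n);
  barrier.Wait();  // all leaf blocks have run
  std::lock_guard<std::mutex> lk(workers_mu);
  for (auto& t : workers) t.join();
  std::cout << std::accumulate(data.begin(), data.end(), 0L) << "\n";  // 32
  return 0;
}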
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 86405e6..b24cdeb 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -33,7 +33,7 @@ namespace Eigen {
namespace internal {
template<std::size_t n, typename Dimension> struct dget {
- static const std::ptrdiff_t value = get<n, Dimension>::value;
+ static const std::size_t value = get<n, Dimension>::value;
};
@@ -90,11 +90,9 @@ struct fixed_size_tensor_index_extraction_helper<Index, 0>
// Fixed size
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::ptrdiff_t... Indices>
-struct Sizes {
+struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
- const Base t = Base();
static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
- static const size_t count = Base::count;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
return Base::count;
@@ -122,16 +120,16 @@ struct Sizes {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const {
- return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
+ return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, *this);
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t);
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this));
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t);
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this));
}
};
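
The Sizes change above drops the stored Base member and inherits from numeric_list directly, so *this can be handed straight to the index-linearization helpers. A minimal sketch of that empty-base idea, written against C++17 with hypothetical numeric_list and Sizes stand-ins rather than Eigen's internal classes:

#include <array>
#include <cstddef>
#include <iostream>

// Compile-time list of dimensions; the values live entirely in the type, so
// the base class is empty at runtime.
template <std::ptrdiff_t... Values>
struct numeric_list {
  static constexpr std::size_t count = sizeof...(Values);
  static constexpr std::ptrdiff_t values[sizeof...(Values)] = {Values...};
};

// Fixed-size dimensions: inheriting from the list means helpers that expect
// the list can be handed *this directly, instead of a stored copy of it.
template <std::ptrdiff_t... Dims>
struct Sizes : numeric_list<Dims...> {
  using Base = numeric_list<Dims...>;
  static constexpr std::ptrdiff_t total_size = (Dims * ... * 1);

  constexpr std::ptrdiff_t rank() const { return static_cast<std::ptrdiff_t>(Base::count); }
  constexpr std::ptrdiff_t operator[](std::size_t i) const { return Base::values[i]; }

  // Column-major linearization: dimension 0 varies fastest.
  template <typename Index>
  Index IndexOfColMajor(const std::array<Index, sizeof...(Dims)>& idx) const {
    Index result = 0, stride = 1;
    for (std::size_t d = 0; d < Base::count; ++d) {
      result += idx[d] * stride;
      stride *= static_cast<Index>(Base::values[d]);
    }
    return result;
  }
};

int main() {
  Sizes<2, 3, 4> dims;
  std::cout << dims.rank() << " dims, total size " << dims.total_size << "\n";  // 3, 24
  std::cout << dims.IndexOfColMajor(std::array<int, 3>{1, 2, 3}) << "\n";       // 1 + 2*2 + 3*6 = 23
  return 0;
}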
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 82dd1e6..0698713 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -41,9 +41,6 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
- typedef typename MakePointerT::RefType RefType;
-
-
};
};
@@ -120,7 +117,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const {
return m_op;
}
-
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
}
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index d641581..834ce07 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -32,7 +32,6 @@ struct TensorEvaluator
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
- typedef Derived XprType;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
@@ -69,9 +68,7 @@ struct TensorEvaluator
return m_data[index];
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
- coeffRef(Index index) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
eigen_assert(m_data);
return m_data[index];
}
@@ -97,9 +94,7 @@ struct TensorEvaluator
}
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
- coeffRef(const array<DenseIndex, NumCoords>& coords) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
eigen_assert(m_data);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
@@ -157,8 +152,6 @@ struct TensorEvaluator<const Derived, Device>
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
- typedef const Derived XprType;
-
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index f060191..08eb559 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -253,7 +253,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
// get data into line_buf
const Index stride = m_strides[dim];
if (stride == 1) {
- m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+ memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
} else {
Index offset = base_offset;
for (int j = 0; j < line_len; ++j, offset += stride) {
@@ -271,7 +271,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
// write back
if (FFTDir == FFT_FORWARD && stride == 1) {
- m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+ memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
} else {
Index offset = base_offset;
const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
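
The two hunks above replace the device memcpy helper with a plain host memcpy on the stride-1 fast path, since this evaluator runs on the host. A tiny sketch of that fast-path/gather split; gather_line is a hypothetical helper, not the evaluator's code.

#include <complex>
#include <cstddef>
#include <cstring>
#include <iostream>

using ComplexScalar = std::complex<float>;

// Copy one FFT "line" out of a strided buffer into contiguous scratch space.
// For stride 1 the line is already contiguous and memcpy suffices; otherwise
// fall back to an element-by-element gather.
void gather_line(ComplexScalar* line_buf, const ComplexScalar* buf,
                 std::ptrdiff_t base_offset, std::ptrdiff_t stride,
                 std::ptrdiff_t line_len) {
  if (stride == 1) {
    std::memcpy(line_buf, buf + base_offset, line_len * sizeof(ComplexScalar));
  } else {
    std::ptrdiff_t offset = base_offset;
    for (std::ptrdiff_t j = 0; j < line_len; ++j, offset += stride) {
      line_buf[j] = buf[offset];
    }
  }
}

int main() {
  ComplexScalar buf[8];
  for (int i = 0; i < 8; ++i) buf[i] = ComplexScalar(float(i), 0.f);
  ComplexScalar line[4];
  gather_line(line, buf, /*base_offset=*/1, /*stride=*/2, /*line_len=*/4);
  std::cout << line[3].real() << "\n";  // element at offset 1 + 3*2 = 7
  return 0;
}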
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index abe85c8..bbd5eb3 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -26,8 +26,8 @@ namespace Eigen {
/// Therefore, by adding the default value, we managed to convert the type and it does not break any
/// existing code as its default value is T*.
namespace internal {
-template<typename XprType>
-struct traits<TensorForcedEvalOp<XprType> >
+template<typename XprType, template <class> class MakePointer_>
+struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename XprType::Scalar Scalar;
@@ -42,26 +42,31 @@ struct traits<TensorForcedEvalOp<XprType> >
enum {
Flags = 0
};
+ template <class T> struct MakePointer {
+ // Intermediate typedef to workaround MSVC issue.
+ typedef MakePointer_<T> MakePointerT;
+ typedef typename MakePointerT::Type Type;
+ };
};
-template<typename XprType>
-struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
+template<typename XprType, template <class> class MakePointer_>
+struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
{
- typedef const TensorForcedEvalOp<XprType>& type;
+ typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
};
-template<typename XprType>
-struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
+template<typename XprType, template <class> class MakePointer_>
+struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
{
- typedef TensorForcedEvalOp<XprType> type;
+ typedef TensorForcedEvalOp<XprType, MakePointer_> type;
};
} // end namespace internal
-template<typename XprType>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
+template<typename XprType, template <class> class MakePointer_>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -83,10 +88,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOn
};
-template<typename ArgType, typename Device>
-struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
+template<typename ArgType, typename Device, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
{
- typedef TensorForcedEvalOp<ArgType> XprType;
+ typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
typedef typename XprType::Index Index;
@@ -102,7 +107,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
- /// op_ is used for sycl
+ /// op_ is used for sycl
: m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
{ }
@@ -143,17 +148,17 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; }
+ EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
/// required by sycl in order to extract the sycl accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
+ const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
/// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+ const Device& device() const{return m_device;}
private:
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;
const Device& m_device;
- CoeffReturnType* m_buffer;
+ typename MakePointer<CoeffReturnType>::Type m_buffer;
};
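
TensorForcedEvalOp now takes the pointer-maker as a template template parameter defaulting to MakePointer, so a device backend can substitute its own pointer type for the evaluation buffer. A minimal sketch of that pattern with hypothetical ForcedEval and DummyExpr names, not the Eigen classes:

#include <iostream>

// Default pointer-maker: maps T to a plain host pointer. A device backend can
// supply its own maker whose Type is a device-side pointer wrapper instead.
template <typename T>
struct MakePointer {
  typedef T* Type;
};

// The pointer-maker is a template template parameter with the host default,
// so code that names ForcedEval<Expr> keeps compiling, while a backend can
// instantiate ForcedEval<Expr, MyDevicePointerMaker>.
template <typename XprType, template <class> class MakePointer_ = MakePointer>
class ForcedEval {
 public:
  typedef typename XprType::Scalar Scalar;
  typedef typename MakePointer_<Scalar>::Type BufferPtr;

  explicit ForcedEval(BufferPtr buffer) : m_buffer(buffer) {}
  BufferPtr data() const { return m_buffer; }

 private:
  BufferPtr m_buffer;  // whatever pointer type the backend chose
};

struct DummyExpr { typedef float Scalar; };

int main() {
  float storage[4] = {1.f, 2.f, 3.f, 4.f};
  ForcedEval<DummyExpr> eval(storage);  // uses the default MakePointer
  std::cout << eval.data()[2] << "\n";  // prints 3
  return 0;
}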
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 2e63899..52b803d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -20,19 +20,7 @@ namespace Eigen {
// map_allocator.
template<typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
};
-#if defined(EIGEN_USE_SYCL)
-namespace TensorSycl {
-namespace internal{
-template <typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor;
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor;
-}
-}
-#endif
-
-
template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
@@ -75,7 +63,7 @@ template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
-template<typename XprType> class TensorForcedEvalOp;
+template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
template<typename ExpressionType, typename DeviceType> class TensorDevice;
template<typename Derived, typename Device> struct TensorEvaluator;
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 3b4f8ed..d73f6dc 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -33,7 +33,7 @@ struct functor_traits<scalar_mod_op<Scalar> >
*/
template <typename Scalar>
struct scalar_mod2_op {
- EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
};
template <typename Scalar>
@@ -42,7 +42,7 @@ struct functor_traits<scalar_mod2_op<Scalar> >
template <typename Scalar>
struct scalar_fmod_op {
- EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op)
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
operator()(const Scalar& a, const Scalar& b) const {
return numext::fmod(a, b);
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index ef1c9c4..ede3939 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -37,8 +37,6 @@ namespace {
{
#ifdef __CUDA_ARCH__
return __clz(val);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::clz(val);
#elif EIGEN_COMP_MSVC
unsigned long index;
_BitScanReverse(&index, val);
@@ -55,8 +53,6 @@ namespace {
{
#ifdef __CUDA_ARCH__
return __clzll(val);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::clz(val);
#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
unsigned long index;
_BitScanReverse64(&index, val);
@@ -92,8 +88,6 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
#if defined(__CUDA_ARCH__)
return __umulhi(a, b);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
#else
return (static_cast<uint64_t>(a) * b) >> 32;
#endif
@@ -103,8 +97,6 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(__CUDA_ARCH__)
return __umul64hi(a, b);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
#elif defined(__SIZEOF_INT128__)
__uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
return static_cast<uint64_t>(v >> 64);
@@ -124,7 +116,7 @@ namespace {
template <typename T>
struct DividerHelper<64, T> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
@@ -205,8 +197,6 @@ class TensorIntDivisor<int32_t, true> {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
#ifdef __CUDA_ARCH__
return (__umulhi(magic, n) >> shift);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
#else
uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
return (static_cast<uint32_t>(v >> 32) >> shift);
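
With the SYCL branches gone, muluh above relies on either the CUDA intrinsic or the __int128 path. The standalone sketch below shows the 64x64-to-high-64 multiply it computes, with a portable 32-bit-halves fallback; this is a hypothetical helper illustrating the technique, not Eigen's exact code.

#include <cstdint>
#include <iostream>

// High 64 bits of a 64x64-bit unsigned multiply. Compilers with __int128
// support can do this directly; the fallback decomposes both operands into
// 32-bit halves and sums the partial products without overflow.
static inline uint64_t muluh(uint64_t a, uint64_t b) {
#if defined(__SIZEOF_INT128__)
  return static_cast<uint64_t>((static_cast<__uint128_t>(a) * b) >> 64);
#else
  const uint64_t a_lo = a & 0xFFFFFFFFull, a_hi = a >> 32;
  const uint64_t b_lo = b & 0xFFFFFFFFull, b_hi = b >> 32;
  const uint64_t lo_lo = a_lo * b_lo;
  const uint64_t hi_lo = a_hi * b_lo;
  const uint64_t lo_hi = a_lo * b_hi;
  const uint64_t hi_hi = a_hi * b_hi;
  // The cross term fits in 64 bits, so no carries are lost.
  const uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFFull) + lo_hi;
  return hi_hi + (hi_lo >> 32) + (cross >> 32);
#endif
}

int main() {
  // 2^63 * 4 = 2^65, whose high 64 bits are 2.
  std::cout << muluh(1ull << 63, 4) << "\n";  // prints 2
  return 0;
}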
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index f92e39d..ee0078b 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -51,12 +51,4 @@
#endif
-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
-#define EIGEN_SLEEP(n) Sleep(n)
-#elif EIGEN_OS_GNULINUX
-#define EIGEN_SLEEP(n) usleep(n * 1000);
-#else
-#define EIGEN_SLEEP(n) sleep(std::max<unsigned>(1, n/1000))
-#endif
-
#endif
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b5ef31d..615559d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -75,7 +75,6 @@ struct PacketType<half, GpuDevice> {
HasSqrt = 1,
HasRsqrt = 1,
HasExp = 1,
- HasExpm1 = 0,
HasLog = 1,
HasLog1p = 0,
HasLog10 = 0,
@@ -169,12 +168,12 @@ template <typename Idx> struct IndexPair {
#ifdef EIGEN_HAS_SFINAE
namespace internal {
- template<typename IndexType, typename Index, Index... Is>
+ template<typename IndexType, Index... Is>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
return { idx[Is]... };
}
- template<typename IndexType, typename Index>
+ template<typename IndexType>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
return array<Index, 0>();
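
The SFINAE helper above expands a compile-time index pack to turn a user-supplied index object into a fixed-size array. A rough equivalent using std::index_sequence, for illustration only; Eigen's version is built on its own numeric_list rather than the standard utility.

#include <array>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Expand a compile-time index pack to read idx[0], idx[1], ... and build a
// fixed-size array out of any object that provides operator[].
template <typename Index, typename IndexType, std::size_t... Is>
std::array<Index, sizeof...(Is)> customIndices2Array(const IndexType& idx,
                                                     std::index_sequence<Is...>) {
  return {{ static_cast<Index>(idx[Is])... }};
}

int main() {
  std::vector<int> custom = {7, 8, 9};
  auto arr = customIndices2Array<std::ptrdiff_t>(custom, std::make_index_sequence<3>{});
  std::cout << arr[0] << " " << arr[1] << " " << arr[2] << "\n";  // 7 8 9
  return 0;
}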
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 6ddd2ca..d34f1e3 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -299,16 +299,6 @@ template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice> {
EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
};
#endif
-
-// It is very expensive to start the memcpy kernel on GPU: we therefore only
-// use it for large copies.
-#ifdef EIGEN_USE_SYCL
-template <typename Index> struct MemcpyTriggerForSlicing<Index, const Eigen::SyclDevice> {
- EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
- EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
-};
-#endif
-
}
// Eval as rvalue
@@ -503,14 +493,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
return NULL;
}
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{
- return m_impl;
- }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& startIndices() const{
- return m_offsets;
- }
+
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
@@ -711,12 +694,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
{
typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
static const int NumDims = internal::array_size<Strides>::value;
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef Strides Dimensions;
enum {
// Alignment can't be guaranteed at compile time since it depends on the
@@ -729,7 +706,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()), m_exprStartIndices(op.startIndices()), m_exprStopIndices(op.stopIndices())
+ : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
{
// Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
@@ -739,7 +716,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
}else{
- /* implies m_strides[i]<0 by assert */
+        /* implies m_strides[i] < 0 by the assert above */
startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
}
@@ -802,6 +779,13 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
sizeof(Scalar));
}
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Strides Dimensions;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -827,15 +811,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
return NULL;
}
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStartIndices() const { return m_exprStartIndices; }
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStopIndices() const { return m_exprStopIndices; }
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
@@ -857,11 +832,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
}
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
-#ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value));
-#else
- return cl::sycl::clamp(value, min, max);
-#endif
}
array<Index, NumDims> m_outputStrides;
@@ -874,10 +845,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
const Strides m_strides;
std::size_t m_block_total_size_max;
- //use by sycl
- const StartIndices m_exprStartIndices;
- //use by sycl
- const StopIndices m_exprStopIndices;
};
// Eval as lvalue
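
The strided-slicing evaluator above clamps start and stop indices into [0, dim] for positive strides and [-1, dim-1] for negative ones, so degenerate intervals collapse to an empty dimension instead of reading out of bounds. A small illustration of that clamping and the resulting dimension size; sliceSize is a hypothetical helper, not the evaluator's exact formula.

#include <algorithm>
#include <cstddef>
#include <iostream>

using Index = std::ptrdiff_t;

static Index clamp(Index value, Index min, Index max) {
  return std::max(min, std::min(max, value));
}

// Number of elements a strided slice [start, stop) with the given stride
// visits inside a dimension of size dim, after clamping. Degenerate
// intervals simply produce 0.
static Index sliceSize(Index start, Index stop, Index stride, Index dim) {
  Index lo, hi, step;
  if (stride > 0) {
    lo = clamp(start, 0, dim);
    hi = clamp(stop, 0, dim);
    step = stride;
  } else {  // stride < 0: the slice walks backwards through the dimension
    hi = clamp(start, -1, dim - 1);
    lo = clamp(stop, -1, dim - 1);
    step = -stride;
  }
  const Index span = hi - lo;
  return span > 0 ? (span + step - 1) / step : 0;
}

int main() {
  std::cout << sliceSize(0, 10, 2, 5) << "\n";    // stop clamps to 5 -> elements 0,2,4 -> 3
  std::cout << sliceSize(4, -10, -1, 5) << "\n";  // stop clamps to -1 -> 4,3,2,1,0 -> 5
  return 0;
}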
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index a8e2552..647bcf1 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -200,13 +200,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& padding_value() const { return m_paddingValue; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
-
private:
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
Index index, int dim_index) const {
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index e341e2e..41d0d00 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -11,20 +11,8 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
-// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
-// so we'll use a macro to make clang happy.
-#ifndef KERNEL_FRIEND
-#if defined(__clang__) && defined(__CUDA__)
-#define KERNEL_FRIEND friend __global__
-#else
-#define KERNEL_FRIEND friend
-#endif
-#endif
-
-
namespace Eigen {
-
/** \class TensorReduction
* \ingroup CXX11_Tensor_Module
*
@@ -692,23 +680,17 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
- template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+ template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
- template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
- template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
-#endif
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
-
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+ template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+ template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+ template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
#endif
+ template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
-#if defined(EIGEN_USE_SYCL)
- template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
- template<typename CoeffReturnType_ ,typename OutAccessor_, typename HostExpr_, typename FunctorExpr_, typename Op_, typename Dims_, typename Index_, typename TupleType_> friend class TensorSycl::internal::FullReductionKernelFunctor;
+ template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif
-
template <typename S, typename O, typename D> friend struct internal::InnerReducer;
// Returns the Index in the input tensor of the first value that needs to be
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index edb0ab2..65638b6 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -287,6 +287,7 @@ struct FullReductionLauncher<
void>::type> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
+ typedef typename Self::CoeffReturnType Scalar;
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index c3ca129..3daecb0 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -25,28 +25,61 @@
namespace Eigen {
namespace internal {
-template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
+template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
-static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
do {
- auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
+ auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
cl::sycl::range<1>{std::min(length, local)}};
/* Two accessors are used: one to the buffer that is being reduced,
* and a second to local memory, used to store intermediate data. */
- auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
- auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
- typedef decltype(aI) InputAccessor;
- typedef decltype(aOut) OutputAccessor;
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
- LocalAccessor scratch(cl::sycl::range<1>(local), h);
+ auto aI =
+ bufI.template get_access<cl::sycl::access::mode::read_write>(h);
+ auto aOut =
+ bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
+ cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ scratch(cl::sycl::range<1>(local), h);
/* The parallel_for invocation chosen is the variant with an nd_item
* parameter, since the code requires barriers for correctness. */
- h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
+ h.parallel_for<KernelName>(
+ r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
+ size_t globalid = id.get_global(0);
+ size_t localid = id.get_local(0);
+ /* All threads collectively read from global memory into local.
+ * The barrier ensures all threads' IO is resolved before
+ * execution continues (strictly speaking, all threads within
+ * a single work-group - there is no co-ordination between
+ * work-groups, only work-items). */
+ if (globalid < length) {
+ scratch[localid] = aI[globalid];
+ }
+ id.barrier(cl::sycl::access::fence_space::local_space);
+
+ /* Apply the reduction operation between the current local
+ * id and the one on the other half of the vector. */
+ if (globalid < length) {
+ int min = (length < local) ? length : local;
+ for (size_t offset = min / 2; offset > 0; offset /= 2) {
+ if (localid < offset) {
+ scratch[localid] += scratch[localid + offset];
+ }
+ id.barrier(cl::sycl::access::fence_space::local_space);
+ }
+ /* The final result will be stored in local id 0. */
+ if (localid == 0) {
+ aI[id.get_group(0)] = scratch[localid];
+ if((length<=local) && globalid ==0){
+ aOut[globalid]=scratch[localid];
+ }
+ }
+ }
+ });
};
- dev.sycl_queue().submit(f);
- dev.asynchronousExec();
+ dev.m_queue.submit(f);
+ dev.m_queue.throw_asynchronous();
/* At this point, you could queue::wait_and_throw() to ensure that
* errors are caught quickly. However, this would likely impact
@@ -54,23 +87,18 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev
length = length / local;
} while (length > 1);
-}
-};
-template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
-template<typename BufferTOut, typename BufferTIn>
-static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
- syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
- bufOut, bufI, dev, length, local);
+
}
+
};
+/// For now let's start with a full reducer
/// Self is useless here because in expression construction we are going to treat reduction as a leaf node.
/// We want to take the reduction child, build a construction from it and apply the full reducer function on it. The full reducer applies the
/// reduction operation on the child of the reduction. Once that is done the reduction is an empty shell and can be thrown away and treated as
// a leaf node.
-
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
@@ -79,8 +107,8 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
- FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
+ typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+ auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor =256; /// initial reduction. If the size is less than red_factor we only create one thread.
size_t inputSize =self.impl().dimensions().TotalSize();
size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
@@ -88,7 +116,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
if(rng ==0) {
red_factor=1;
};
- size_t tileSize =dev.sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+ size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
size_t GRange=std::max((size_t )1, rng);
    // convert the global range to a power of 2 for the reduction
@@ -105,66 +133,105 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
size_t outTileSize = tileSize;
/// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
if (GRange < outTileSize) outTileSize=GRange;
+    // get the final output buffer; the buffer is created at this point because there is no need for an assign step
+ auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
/// creating the shared memory for calculating reduction.
    /// This one is used to collect all the reduced values from shared memory as we don't have a global barrier on the GPU. Once it is saved we can
/// recursively apply reduction on it in order to reduce the whole.
auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
- // Dims dims= self.xprDims();
- //Op functor = reducer;
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // this is a workaround for gcc 4.8 bug
- typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType;
+ Dims dims= self.xprDims();
+ Op functor = reducer;
+ dev.m_queue.submit([&](cl::sycl::handler &cgh) {
// create a tuple of accessors from Evaluator
- TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+ auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(tmp_global_accessor) OutAccessor;
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
- TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, TupleType>
- (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors));
+
+ cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
+ typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+ auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ /// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
+ /// the first is when it is used as a root to launch the sub-kernel, and the second is when it is treated as a leaf node that passes its
+ /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
+ const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+ /// This is the evaluator for device_self_expr. It is equivalent to the self that was passed to the run function; the difference is that
+ /// this device evaluator is detectable and recognisable on the device.
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ /// const_cast added as a simple workaround for the qualifier-drop error
+ auto globalid=itemID.get_global_linear_id();
+
+ if(globalid<rng)
+ tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
+ else
+ tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
+
+ if(remaining!=0 && globalid==0 )
+ // this adds the remainder of the input when the input size is not divisible by red_factor.
+ tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
+ });
});
- dev.asynchronousExec();
+ dev.m_queue.throw_asynchronous();
- // getting final out buffer at the moment the created buffer is true because there is no need for assign
- auto out_buffer =dev.get_sycl_buffer(output);
- /// This is used to recursively reduce the tmp value to an element of 1;
- syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+/// This is used to recursively reduce the temporary buffer down to a single element.
+ syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
}
};
-
template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
typedef typename Self::CoeffReturnType CoeffReturnType;
static const bool HasOptimizedImplementation = false;
- static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) {
+ static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
- FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
- typename Self::Index range, GRange, tileSize;
- typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+ typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+ auto functors = TensorSycl::internal::extractFunctors(self.impl());
+
+ size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+ size_t GRange=num_coeffs_to_preserve;
+ if (tileSize>GRange) tileSize=GRange;
+ else if(GRange>tileSize){
+ size_t xMode = GRange % tileSize;
+ if (xMode != 0) GRange += (tileSize - xMode);
+ }
// getting final out buffer at the moment the created buffer is true because there is no need for assign
/// creating the shared memory for calculating the reduction.
/// This one is used to collect all the reduced values of shared memory, as we don't have a global barrier on the GPU. Once it is saved we can
/// recursively apply the reduction on it in order to reduce the whole.
- dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // this is workaround for gcc 4.8 bug.
- typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
- // create a tuple of accessors from Evaluator
- Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
- Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
- TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
- (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
+ typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+ Dims dims= self.xprDims();
+ Op functor = reducer;
+ dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+ // create a tuple of accessors from Evaluator
+ auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+ auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
+
+ cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+ typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+ auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ /// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
+ /// the first is when it is used as a root to launch the sub-kernel, and the second is when it is treated as a leaf node that passes its
+ /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
+ const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+ /// This is the evaluator for device_self_expr. It is equivalent to the self that was passed to the run function; the difference is that
+ /// this device evaluator is detectable and recognisable on the device.
+ typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeiceSelf;
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ /// const_cast added as a simple workaround for the qualifier-drop error
+ auto globalid=itemID.get_global_linear_id();
+ if (globalid< static_cast<size_t>(num_coeffs_to_preserve)) {
+ typename DeiceSelf::CoeffReturnType accum = functor.initialize();
+ GenericDimReducer<DeiceSelf::NumReducedDims-1, DeiceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast<Op&>(functor), &accum);
+ functor.finalize(accum);
+ output_accessor.get_pointer()[globalid]= accum;
+ }
+ });
});
- dev.asynchronousExec();
+ dev.m_queue.throw_asynchronous();
return false;
}
};
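
The comments in the reduction hunk above describe a two-pass scheme: each work item first reduces a fixed-size chunk of the input into a temporary global buffer, work item 0 folds in any remainder, and the temporary buffer is then reduced again (by syclGenericBufferReducer) down to one element. The plain C++ sketch below mirrors only the launch-geometry and indexing arithmetic of that scheme; it uses no SYCL, and all names (LaunchGeometry, make_geometry) are illustrative rather than Eigen API.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

struct LaunchGeometry {
  std::size_t red_factor;  // elements reduced per work item in the first pass
  std::size_t rng;         // number of work items that own a full chunk
  std::size_t remaining;   // leftover elements folded into work item 0
  std::size_t tileSize;    // work-group size
  std::size_t GRange;      // global range, rounded up to a multiple of tileSize
};

LaunchGeometry make_geometry(std::size_t inputSize, std::size_t maxWorkGroup) {
  LaunchGeometry g;
  g.red_factor = 256;
  g.rng = inputSize / g.red_factor;
  g.remaining = inputSize % g.red_factor;
  if (g.rng == 0) g.red_factor = 1;              // tiny input: one element per item
  g.tileSize = maxWorkGroup / 2;
  g.GRange = std::max<std::size_t>(1, g.rng);
  // round the global range up so it is divisible by the work-group size
  if (g.tileSize > g.GRange) g.tileSize = g.GRange;
  else if (std::size_t m = g.GRange % g.tileSize) g.GRange += g.tileSize - m;
  return g;
}

int main() {
  std::vector<double> in(100000);
  std::iota(in.begin(), in.end(), 0.0);
  LaunchGeometry g = make_geometry(in.size(), 256);

  // first pass: work item `id` reduces red_factor consecutive elements
  std::vector<double> partial(g.GRange, 0.0);
  for (std::size_t id = 0; id < g.GRange; ++id)
    if (id < g.rng)
      for (std::size_t k = 0; k < g.red_factor; ++k)
        partial[id] += in[id * g.red_factor + k];
  // work item 0 also picks up the tail that does not fill a whole chunk
  for (std::size_t k = 0; k < g.remaining; ++k)
    partial[0] += in[g.rng * g.red_factor + k];

  // second pass (on the device this is done by syclGenericBufferReducer)
  double total = std::accumulate(partial.begin(), partial.end(), 0.0);
  std::cout << total << " == " << std::accumulate(in.begin(), in.end(), 0.0) << "\n";
}
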
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index e430b08..14e392e 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -224,11 +224,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; }
- /// added for sycl in order to construct the buffer from sycl device
- ReverseDimensions functor() const { return m_reverse; }
-
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index edc9dd3..113c060 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -117,7 +117,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation())
+ : m_impl(op.expression(), device)
{
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const Shuffle& shuffle = op.shufflePermutation();
@@ -187,11 +187,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- // required by sycl
- EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;}
- // required by sycl
- EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {return m_impl;}
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
Index inputIndex = 0;
@@ -211,12 +206,11 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
return inputIndex + index * m_inputStrides[NumDims - 1];
}
}
+
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
- /// required by sycl
- Shuffle m_shuffle;
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index e6a666f..2854a4a 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -31,12 +31,12 @@ namespace Eigen {
*
* \sa Tensor
*/
-template<typename T, typename Dimensions, int Options> class TensorStorage;
+template<typename T, typename Dimensions, int Options_> class TensorStorage;
// Pure fixed-size storage
-template<typename T, typename FixedDimensions, int Options_>
-class TensorStorage
+template<typename T, int Options_, typename FixedDimensions>
+class TensorStorage<T, FixedDimensions, Options_>
{
private:
static const std::size_t Size = FixedDimensions::total_size;
@@ -66,7 +66,7 @@ class TensorStorage
// pure dynamic
-template<typename T, typename IndexType, int NumIndices_, int Options_>
+template<typename T, int Options_, typename IndexType, int NumIndices_>
class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
{
public:
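
For readers puzzled by the TensorStorage hunk just above: the change only reorders the parameters in the partial specialisations' own template parameter lists; what identifies a specialisation is the argument list after the class name, which still has to line up with the primary template's parameters. A minimal standalone illustration of the same construct, with made-up names Storage/Fixed (not Eigen types):

#include <cstddef>
#include <iostream>

template <typename T, typename Dimensions, int Options> class Storage;   // primary

template <std::size_t N> struct Fixed { static const std::size_t total_size = N; };

// The specialisation's parameters are declared as <T, Options_, FixedDimensions>,
// yet the specialised type still reads Storage<T, FixedDimensions, Options_>,
// matching the primary's <T, Dimensions, Options> order.
template <typename T, int Options_, typename FixedDimensions>
class Storage<T, FixedDimensions, Options_> {
 public:
  static const std::size_t Size = FixedDimensions::total_size;
  T data[Size];
};

int main() {
  Storage<float, Fixed<8>, 0> s;                        // picks the specialisation
  std::cout << sizeof(s.data) / sizeof(float) << "\n";  // prints 8
}
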
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 2237140..6c35bfd 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -117,11 +117,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_strides(op.strides())
+ : m_impl(op.expression(), device)
{
m_dimensions = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
- m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+ m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -224,11 +224,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- /// required by sycl in order to extract the accessor
- Strides functor() const { return m_strides; }
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
@@ -255,9 +250,9 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
- const Strides m_strides;
};
+
// Eval as lvalue
template<typename Strides, typename ArgType, typename Device>
struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
@@ -291,11 +286,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
return this->m_impl.coeffRef(this->srcCoeff(index));
}
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; }
-
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
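
As a side note on the ceilf change above: the number of coefficients a strided evaluator keeps per dimension is ceil(dim / stride). A tiny self-contained check of that arithmetic, using the usual integer form instead of the float round-trip (purely illustrative, not Eigen code):

#include <cassert>
#include <cmath>
#include <cstdint>

std::int64_t strided_size(std::int64_t dim, std::int64_t stride) {
  return (dim + stride - 1) / stride;          // == ceil(dim / stride)
}

int main() {
  assert(strided_size(10, 3) == 4);            // keeps indices 0, 3, 6, 9
  assert(strided_size(9, 3) == 3);             // keeps indices 0, 3, 6
  assert(strided_size(10, 3) ==
         static_cast<std::int64_t>(std::ceil(10.0 / 3.0)));
  return 0;
}
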
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
index 9d5a6d4..bb8800d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -20,14 +20,12 @@
template <class T>
struct MakeGlobalPointer {
typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
- typedef typename cl::sycl::global_ptr<T>::reference_t RefType;
};
// global pointer to set different attribute state for a class
template <class T>
struct MakeLocalPointer {
typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
- typedef typename cl::sycl::local_ptr<T>::reference_t RefType;
};
@@ -35,9 +33,6 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {
- template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
-
-
/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
struct NoOP;
@@ -80,15 +75,8 @@ template<typename T> struct GetType<false, T>{
/// this is used for extracting tensor reduction
#include "TensorReductionSycl.h"
-/// this is used for extracting tensor convolution
-#include "TensorConvolutionSycl.h"
-
// kernel execution using fusion
#include "TensorSyclRun.h"
-//sycl functors
-#include "TensorSyclFunctors.h"
-
-#include "TensorContractionSycl.h"
#endif // end of EIGEN_USE_SYCL
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
index ee8f3c9..8729c86 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -48,9 +48,9 @@ struct DeviceConvertor{
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
/// type is TensorMap
#define TENSORMAPCONVERT(CVQual)\
-template <typename T, int Options_, template <class> class MakePointer_>\
-struct ConvertToDeviceExpression<CVQual TensorMap<T, Options_, MakePointer_> > {\
- typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\
+template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_>\
+struct ConvertToDeviceExpression<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_> > {\
+ typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
};
TENSORMAPCONVERT(const)
@@ -97,18 +97,8 @@ template <typename Expr>\
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
: DeviceConvertor<ExprNode, Res, Expr>{};
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp
-#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\
-template <typename Expr>\
-struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\
- typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\
-};
-KERNELBROKERCONVERTFORCEDEVAL(const)
-KERNELBROKERCONVERTFORCEDEVAL()
-#undef KERNELBROKERCONVERTFORCEDEVAL
-
-
-
+KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)
+KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
KERNELBROKERCONVERT(const, true, TensorEvalToOp)
KERNELBROKERCONVERT(, false, TensorEvalToOp)
#undef KERNELBROKERCONVERT
@@ -124,40 +114,6 @@ KERNELBROKERCONVERTREDUCTION(const)
KERNELBROKERCONVERTREDUCTION()
#undef KERNELBROKERCONVERTREDUCTION
-#define KERNELBROKERCONVERTSLICEOP(CVQual)\
-template<typename StartIndices, typename Sizes, typename XprType>\
-struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename ConvertToDeviceExpression<XprType>::Type> Type;\
-};
-
-KERNELBROKERCONVERTSLICEOP(const)
-KERNELBROKERCONVERTSLICEOP()
-#undef KERNELBROKERCONVERTSLICEOP
-
-
-#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
-struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename ConvertToDeviceExpression<XprType>::Type> Type;\
-};
-
-KERNELBROKERCONVERTERSLICESTRIDEOP(const)
-KERNELBROKERCONVERTERSLICESTRIDEOP()
-#undef KERNELBROKERCONVERTERSLICESTRIDEOP
-
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp
-#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\
-template <DenseIndex DimId, typename Expr>\
-struct ConvertToDeviceExpression<CVQual TensorChippingOp<DimId, Expr> > {\
- typedef CVQual TensorChippingOp<DimId, typename ConvertToDeviceExpression<Expr>::Type> Type;\
-};
-KERNELBROKERCONVERTCHIPPINGOP(const)
-KERNELBROKERCONVERTCHIPPINGOP()
-#undef KERNELBROKERCONVERTCHIPPINGOP
-
-
-
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen
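
The ConvertToDeviceExpression machinery above is a recursive type rewrite: it walks the expression type, swaps each leaf's pointer maker for MakeGlobalPointer, and reproduces every interior node around the converted child. A stripped-down sketch of that idea with invented node types (Leaf, Unary, HostPtr, DevicePtr, ToDevice), not Eigen's actual classes:

#include <type_traits>

template <class T> struct HostPtr   { typedef T* Type; };
template <class T> struct DevicePtr { typedef T* Type; };   // stand-in for global_ptr

template <typename T, template <class> class MakePtr> struct Leaf {
  typename MakePtr<T>::Type data;
};
template <typename Nested> struct Unary { Nested nested; };

// primary: leave unknown nodes untouched
template <typename Expr> struct ToDevice { typedef Expr Type; };
// leaves: rebind the pointer maker to the device one
template <typename T, template <class> class MakePtr>
struct ToDevice<Leaf<T, MakePtr> > { typedef Leaf<T, DevicePtr> Type; };
// interior nodes: recurse into the child, keep the node itself
template <typename Nested>
struct ToDevice<Unary<Nested> > { typedef Unary<typename ToDevice<Nested>::Type> Type; };

static_assert(std::is_same<ToDevice<Unary<Leaf<float, HostPtr> > >::Type,
                           Unary<Leaf<float, DevicePtr> > >::value,
              "leaf pointer maker is rewritten, tree shape preserved");

int main() {}
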
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
index 3b83b1d..7ed3a3a 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -25,21 +25,12 @@
namespace Eigen {
namespace TensorSycl {
namespace internal {
-
-template <typename Expr, typename Dims>
-struct DeviceFixedSizeTensor;
-
-template <typename Expr, typename std::ptrdiff_t... Indices>
-struct DeviceFixedSizeTensor<Expr, Eigen::Sizes<Indices...>>{
- template<typename Data>
- static EIGEN_ALWAYS_INLINE Expr instantiate(Data& dt) {return Expr(ConvertToActualTypeSycl(typename Expr::Scalar, dt), Indices...);}
-};
/// this class is used by EvalToOp in order to create an lhs expression which is
/// a pointer from an accessor on device-only buffer
template <typename PtrType, size_t N, typename... Params>
struct EvalToLHSConstructor {
PtrType expr;
- EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t) : expr(ConvertToActualTypeSycl(typename Eigen::internal::remove_all<PtrType>::type, utility::tuple::get<N>(t))) {}
+ EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t): expr((&(*(utility::tuple::get<N>(t).get_pointer())))) {}
};
/// \struct ExprConstructor is used to reconstruct the expression on the device and
@@ -54,39 +45,21 @@ struct ExprConstructor;
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorMap
#define TENSORMAP(CVQual)\
-template <typename T, int Options_,\
+template <typename Scalar_, int Options_, int Options2_, int Options3_, int NumIndices_, typename IndexType_,\
template <class> class MakePointer_, size_t N, typename... Params>\
-struct ExprConstructor< CVQual TensorMap<T, Options_, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\
- typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\
+struct ExprConstructor< CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>,\
+CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options3_, MakePointer_>, N>, Params...>{\
+ typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\
+ : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
};
-
TENSORMAP(const)
TENSORMAP()
#undef TENSORMAP
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorMap
-#define TENSORMAPFIXEDSIZE(CVQual)\
-template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_,\
-template <class> class MakePointer_, size_t N, typename... Params>\
-struct ExprConstructor< CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_>, N>, Params...>{\
- typedef CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer> Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\
- : expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\
-};
-TENSORMAPFIXEDSIZE(const)
-TENSORMAPFIXEDSIZE()
-#undef TENSORMAPFIXEDSIZE
-
#define UNARYCATEGORY(CVQual)\
template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\
struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\
@@ -188,30 +161,8 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual
ASSIGN(const)
ASSIGN()
#undef ASSIGN
-
-
-
-
- /// specialisation of the \ref ExprConstructor struct when the node type is
- /// const TensorAssignOp
- #define CONVERSIONEXPRCONST(CVQual)\
- template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\
- struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\
- typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\
- typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\
- my_nested_type nestedExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\
- };
-
- CONVERSIONEXPRCONST(const)
- CONVERSIONEXPRCONST()
- #undef CONVERSIONEXPRCONST
-
/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorEvalToOp /// 0 here is the output number in the buffer
+/// TensorEvalToOp
#define EVALTO(CVQual)\
template <typename OrigExpr, typename Expr, typename... Params>\
struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\
@@ -234,14 +185,14 @@ EVALTO()
/// TensorForcedEvalOp
#define FORCEDEVAL(CVQual)\
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
+struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
- typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
- TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
+ typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
+ TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, 0, typename TensorForcedEvalOp<DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
+ : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
};
FORCEDEVAL(const)
@@ -262,130 +213,17 @@ struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPoi
CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
- NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
+ NumIndices, 0, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
+ : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
};
SYCLREDUCTIONEXPR(const)
SYCLREDUCTIONEXPR()
#undef SYCLREDUCTIONEXPR
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorContractionOp
-#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
-template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
-struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
-CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
- static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
- typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
- NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\
- typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\
- Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
-};
-
-SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTION
-
-
-
-#define SYCLSLICEOPEXPR(CVQual)\
-template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.dimensions()) {}\
-};
-
-SYCLSLICEOPEXPR(const)
-SYCLSLICEOPEXPR()
-#undef SYCLSLICEOPEXPR
-
-
-#define SYCLSLICESTRIDEOPEXPR(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.stopIndices(),funcD.strides()) {}\
-};
-
-SYCLSLICESTRIDEOPEXPR(const)
-SYCLSLICESTRIDEOPEXPR()
-#undef SYCLSLICESTRIDEOPEXPR
-
-#define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\
-template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\
-};
-
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const)
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, )
-
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const)
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, )
-#undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST
-
-#define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\
-template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\
-};
-
-SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const)
-SYCLPADDINGOPEXPRCONST(TensorPaddingOp, )
-#undef SYCLPADDINGOPEXPRCONST
-
-
-// TensorChippingOp
-#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\
-template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorChippingOp <DimId, OrigXprType> , CVQual TensorChippingOp<DimId, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorChippingOp<DimId, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.offset(), funcD.dimId()) {}\
-};
-
-SYCLTENSORCHIPPINGOPEXPR(const)
-SYCLTENSORCHIPPINGOPEXPR()
-#undef SYCLTENSORCHIPPINGOPEXPR
-
-
/// template deduction for \ref ExprConstructor struct
template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t)
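
The TensorMap/ForcedEval/Reduction constructors above all follow the same placeholder protocol: on the host every leaf of the expression is numbered, and on the device the ExprConstructor rebuilds leaf N from slot N of the tuple of accessors. The sketch below shows only that indexing idea, with std::tuple standing in for utility::tuple and made-up types (PlaceHolder, DeviceLeaf, Build); it is not Eigen's implementation.

#include <cstddef>
#include <iostream>
#include <tuple>

template <typename Expr, std::size_t N> struct PlaceHolder {};

struct DeviceLeaf { const float* ptr; };

// one specialisation per (device node, placeholder) pair
template <typename DevNode, typename PlaceHolderNode> struct Build;

template <typename Expr, std::size_t N>
struct Build<DeviceLeaf, PlaceHolder<Expr, N> > {
  template <typename Tuple>
  static DeviceLeaf run(const Tuple& accessors) {
    return DeviceLeaf{std::get<N>(accessors)};   // leaf N is rebuilt from slot N
  }
};

int main() {
  float a[3] = {1.f, 2.f, 3.f}, b[3] = {4.f, 5.f, 6.f};
  auto accessors = std::make_tuple(a, b);        // would be SYCL accessors on device
  struct HostExprA {};                           // tag types for two host leaves
  struct HostExprB {};
  DeviceLeaf la = Build<DeviceLeaf, PlaceHolder<HostExprA, 0> >::run(accessors);
  DeviceLeaf lb = Build<DeviceLeaf, PlaceHolder<HostExprB, 1> >::run(accessors);
  std::cout << la.ptr[0] + lb.ptr[2] << "\n";    // 1 + 6 = 7
}
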
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
index b512d43..b1da685 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -35,8 +35,6 @@
namespace Eigen {
namespace TensorSycl {
namespace internal {
-#define RETURN_CPP11(expr) ->decltype(expr) {return expr;}
-
/// \struct ExtractAccessor: Extract Accessor Class is used to extract the
/// accessor from a buffer.
/// Depending on the type of the leaf node we can get a read accessor or a
@@ -45,192 +43,159 @@ template <typename Evaluator>
struct ExtractAccessor;
struct AccessorConstructor{
- template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval)
- RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval))
-
- template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2)
- RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2)))
-
- template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3)
- RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))))
-
- template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval)
- RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data())))
+ template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, Arg eval)
+ -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) {
+ return ExtractAccessor<Arg>::getTuple(cgh, eval);
+ }
+
+ template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1, Arg2 eval2)
+ -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) {
+ return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2));
+ }
+ template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1 , Arg2 eval2 , Arg3 eval3)
+ -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) {
+ return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
+ }
+ template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval)
+ -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM,
+ typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){
+ return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM, typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()));
+ }
};
/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp
-#define SYCLUNARYCATEGORYEXTACC(CVQual)\
-template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\
-RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
+/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp and const TensorBroadcastingOp
+template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){
+ return AccessorConstructor::getTuple(cgh, eval.impl());
+ }
};
-SYCLUNARYCATEGORYEXTACC(const)
-SYCLUNARYCATEGORYEXTACC()
-#undef SYCLUNARYCATEGORYEXTACC
-
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
-#define SYCLBINARYCATEGORYEXTACC(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp
+template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+ return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+ }
};
-
-SYCLBINARYCATEGORYEXTACC(const)
-SYCLBINARYCATEGORYEXTACC()
-#undef SYCLBINARYCATEGORYEXTACC
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// const TensorCwiseTernaryOp
-#define SYCLTERNARYCATEGORYEXTACC(CVQual)\
-template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\
+template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){
+ return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());
+ }
};
-SYCLTERNARYCATEGORYEXTACC(const)
-SYCLTERNARYCATEGORYEXTACC()
-#undef SYCLTERNARYCATEGORYEXTACC
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseTernaryOp
+template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorCwiseSelectOp. This is a special case where there is no OP
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){
+ return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());
+ }
+};
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorCwiseSelectOp. This is a special case where there is no OP
-#define SYCLSELECTOPEXTACC(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorAssignOp
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+ return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+ }
};
-SYCLSELECTOPEXTACC(const)
-SYCLSELECTOPEXTACC()
-#undef SYCLSELECTOPEXTACC
-
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
-#define SYCLTENSORASSIGNOPEXTACC(CVQual)\
-template <typename LHSExpr, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
-};
-
- SYCLTENSORASSIGNOPEXTACC(const)
- SYCLTENSORASSIGNOPEXTACC()
- #undef SYCLTENSORASSIGNOPEXTACC
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
#define TENSORMAPEXPR(CVQual, ACCType)\
template <typename PlainObjectType, int Options_, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\
+ static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> eval)\
+ -> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
+ return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
+ }\
};
-
TENSORMAPEXPR(const, cl::sycl::access::mode::read)
TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
#undef TENSORMAPEXPR
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
-#define SYCLFORCEDEVALEXTACC(CVQual)\
-template <typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorForcedEvalOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> eval)
+ -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
+ return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
+ }
};
-SYCLFORCEDEVALEXTACC(const)
-SYCLFORCEDEVALEXTACC()
-#undef SYCLFORCEDEVALEXTACC
-
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorForcedEvalOp<Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorEvalToOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<const TensorEvalToOp<Expr>, Dev> eval)
+ -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){
+ return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));
+ }
+};
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
-#define SYCLEVALTOEXTACC(CVQual)\
-template <typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\
- RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorEvalToOp<Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorReductionOp
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> eval)
+ -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
+ return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
+ }
};
-SYCLEVALTOEXTACC(const)
-SYCLEVALTOEXTACC()
-#undef SYCLEVALTOEXTACC
-
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXTACC(CVQual)\
-template <typename OP, typename Dim, typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-SYCLREDUCTIONEXTACC(const)
-SYCLREDUCTIONEXTACC()
-#undef SYCLREDUCTIONEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
-#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
- struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTIONEXTACC
-
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorSlicingOp.
-#define SYCLSLICEOPEXTACC(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\
- RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICEOPEXTACC(const)
-SYCLSLICEOPEXTACC()
-#undef SYCLSLICEOPEXTACC
-// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorStridingSlicingOp.
-#define SYCLSLICESTRIDEOPEXTACC(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICESTRIDEOPEXTACC(const)
-SYCLSLICESTRIDEOPEXTACC()
-#undef SYCLSLICESTRIDEOPEXTACC
-
-// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorChippingOp.
-#define SYCLTENSORCHIPPINGOPEXTACC(CVQual)\
-template<DenseIndex DimId, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLTENSORCHIPPINGOPEXTACC(const)
-SYCLTENSORCHIPPINGOPEXTACC()
-#undef SYCLTENSORCHIPPINGOPEXTACC
-
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorReductionOp<OP, Dim, Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> >{};
/// template deduction for \ref ExtractAccessor
template <typename Evaluator>
-auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval)
--> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, eval)) {
- return ExtractAccessor<Evaluator>::getTuple(cgh, eval);
+auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& expr)
+-> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, expr)) {
+ return ExtractAccessor<Evaluator>::getTuple(cgh, expr);
}
} /// namespace TensorSycl
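
A pattern worth calling out in the rewritten ExtractAccessor (and in the FunctorExtractor changes that follow): the full specialisation is written once for the const expression type, and the non-const specialisation simply derives from it. A minimal standalone version of that trick, with illustrative names Wrapper/Extract rather than Eigen types:

#include <iostream>

template <typename T> struct Wrapper { T value; };

template <typename Expr> struct Extract;              // primary, left undefined

template <typename T>
struct Extract<const Wrapper<T> > {                   // all the logic lives here
  static T get(const Wrapper<T>& w) { return w.value; }
};

template <typename T>
struct Extract<Wrapper<T> > : Extract<const Wrapper<T> > {};  // reuse, no duplication

int main() {
  Wrapper<int> w{42};
  std::cout << Extract<Wrapper<int> >::get(w) << "\n";        // 42, via the base
  std::cout << Extract<const Wrapper<int> >::get(w) << "\n";  // 42
}
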
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index ee02018..4271253 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -36,277 +36,135 @@ namespace internal {
template <typename Evaluator> struct FunctorExtractor{
typedef typename Evaluator::Dimensions Dimensions;
const Dimensions m_dimensions;
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+ const Dimensions& dimensions() const { return m_dimensions; }
FunctorExtractor(const Evaluator& expr)
: m_dimensions(expr.dimensions()) {}
};
-/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
-///TensorConversionOp
-#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
-template <typename ArgType1, typename ArgType2, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\
- FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\
- : subExpr(expr.impl()) {}\
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp, and const TensorBroadcastingOp
+template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ OP func;
+ FunctorExtractor(const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev>& expr)
+ : rhsExpr(expr.impl()), func(expr.functor()) {}
};
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
+template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> >{};
-SYCLEXTRFUNCCONVERSION(TensorConversionOp, const)
-SYCLEXTRFUNCCONVERSION(TensorConversionOp, )
-#undef SYCLEXTRFUNCCONVERSION
-
-#define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\
-template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\
-struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\
-FunctorExtractor(const TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev>& ){}\
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ OP func;
+ FunctorExtractor(const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)
+ : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}
};
-SYCLEXTRTENSORMAPFIXEDSIZE(const)
-SYCLEXTRTENSORMAPFIXEDSIZE()
-#undef SYCLEXTRTENSORMAPFIXEDSIZE
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseBinaryOp
+template <template <class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
-#define SYCLEXTRFUNCUNARY(CVQual)\
-template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& expr)\
- : rhsExpr(expr.impl()), func(expr.functor()) {}\
+/// const TensorCwiseTernaryOp
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;
+ FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;
+ FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;
+ OP func;
+ FunctorExtractor(const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)
+ : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}
};
-SYCLEXTRFUNCUNARY(const)
-SYCLEXTRFUNCUNARY()
-#undef SYCLEXTRFUNCUNARY
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseTernaryOp
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct FunctorExtractor<TensorEvaluator< TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
+:FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseBinaryOp
-#define SYCLEXTRFUNCBIINARY(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}\
+/// const TensorCwiseSelectOp. This is a specialisation without OP so it has to be separated.
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;
+ FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;
+ FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;
+ FunctorExtractor(const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)
+ : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}
};
-SYCLEXTRFUNCBIINARY(const)
-SYCLEXTRFUNCBIINARY()
-#undef SYCLEXTRFUNCBIINARY
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseSelectOp. This is a specialisation without OP so it has to be separated.
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
+:FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {};
-/// specialisation of the \ref FunctorExtractor struct when the node type is TensorCwiseTernaryOp
-#define SYCLEXTRFUNCTERNARY(CVQual)\
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;\
- FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;\
- FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)\
- : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}\
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorAssignOp. This is a specialisation without OP so it has to be separated.
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ FunctorExtractor(const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)
+ : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}
};
-SYCLEXTRFUNCTERNARY(const)
-SYCLEXTRFUNCTERNARY()
-#undef SYCLEXTRFUNCTERNARY
-
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCSELECTOP(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
-struct FunctorExtractor< TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;\
- FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;\
- FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)\
- : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}\
-};
+/// TensorAssignOp. This is a specialisation without OP so it has to be separated.
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
+:FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
-SYCLEXTRFUNCSELECTOP(const)
-SYCLEXTRFUNCSELECTOP()
-#undef SYCLEXTRFUNCSELECTOP
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorAssignOp. This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCASSIGNOP(CVQual)\
-template <typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}\
+/// const TensorEvalToOp. This is a specialisation without OP so it has to be separated.
+template <typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ FunctorExtractor(const TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev>& expr)
+ : rhsExpr(expr.impl()) {}
};
-SYCLEXTRFUNCASSIGNOP(const)
-SYCLEXTRFUNCASSIGNOP()
-#undef SYCLEXTRFUNCASSIGNOP
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorEvalToOp, This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCEVALTOOP(CVQual)\
-template <typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev>& expr)\
- : rhsExpr(expr.impl()) {}\
-};
-
-SYCLEXTRFUNCEVALTOOP(const)
-SYCLEXTRFUNCEVALTOOP()
-#undef SYCLEXTRFUNCEVALTOOP
+/// TensorEvalToOp. This is a specialisation without OP so it has to be separated.
+template <typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorEvalToOp<RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {};
template<typename Dim, size_t NumOutputDim> struct DimConstr {
template<typename InDim>
- static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return dims;}
+ static inline Dim getDim(InDim dims ) {return dims;}
};
template<typename Dim> struct DimConstr<Dim, 0> {
template<typename InDim>
- static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));}
-};
-
-#define SYCLEXTRFUNCREDUCTIONOP(CVQual)\
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{\
- typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\
- typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
- const Dimensions m_dimensions;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\
- : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\
-};
-
-
-SYCLEXTRFUNCREDUCTIONOP(const)
-SYCLEXTRFUNCREDUCTIONOP()
-#undef SYCLEXTRFUNCREDUCTIONOP
-
-#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
- typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\
- typedef typename Evaluator::Dimensions Dimensions;\
- const Dimensions m_dimensions;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\
- : m_dimensions(expr.dimensions()) {}\
+ static inline Dim getDim(InDim dims ) {return Dim(dims.TotalSize());}
};
-
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
-#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorSlicingOp. This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCTSLICEOP(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
- FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
- const StartIndices m_offsets;\
- const Sizes m_dimensions;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_offsets(expr.startIndices()), m_dimensions(expr.dimensions()) {}\
- EIGEN_STRONG_INLINE const StartIndices& startIndices() const {return m_offsets;}\
- EIGEN_STRONG_INLINE const Sizes& dimensions() const {return m_dimensions;}\
-};
-
-SYCLEXTRFUNCTSLICEOP(const)
-SYCLEXTRFUNCTSLICEOP()
-#undef SYCLEXTRFUNCTSLICEOP
-
-#define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
- FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
- const StartIndices m_startIndices;\
- const StopIndices m_stopIndices;\
- const Strides m_strides;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices,Strides, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_startIndices(expr.exprStartIndices()), m_stopIndices(expr.exprStopIndices()), m_strides(expr.strides()) {}\
- EIGEN_STRONG_INLINE const StartIndices& startIndices() const { return m_startIndices; }\
- EIGEN_STRONG_INLINE const StartIndices& stopIndices() const { return m_stopIndices; }\
- EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; }\
-};
-
-SYCLEXTRFUNCTSLICESTRIDEOP(const)
-SYCLEXTRFUNCTSLICESTRIDEOP()
-#undef SYCLEXTRFUNCTSLICESTRIDEOP
-
-// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory
-#define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\
-template<typename Param, typename XprType, typename Dev>\
-struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
- const Param m_param;\
- EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
- FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\
-};
-
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const)
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), )
-
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const)
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), )
-#undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT
-
-// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory
-#define PADDINGOPFUNCEXT(OPEXPR, FUNCCALL, SCALARFUNCCALL, CVQual)\
-template<typename Param, typename XprType, typename Dev>\
-struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
- const Param m_param;\
- typedef typename Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>::Scalar Scalar;\
- const Scalar m_scalar_param;\
- EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
- EIGEN_STRONG_INLINE const Scalar& scalar_param() const { return m_scalar_param; }\
- FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_param(expr.FUNCCALL), m_scalar_param(expr.SCALARFUNCCALL) {}\
-};
-
-PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const)
-PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), )
-#undef PADDINGOPFUNCEXT
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is TensorContractionOp and TensorConcatenationOp
-/// for TensorContractionOp the LHS and RHS here are the original one no need to apply condition on their type.
-#define SYCLEXTRFUNCCONTRACTCONCAT(OPEXPR, FUNCCALL, CVQual)\
-template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const Param func;\
- FunctorExtractor(const TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
-};
-
-// TensorConcatenationOp
-SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
-SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
-#undef SYCLEXTRFUNCCONTRACTCONCAT
-
-//TensorChippingOp
-#define SYCLEXTRFUNCCHIPPINGOP(CVQual)\
-template<DenseIndex DimId, typename XprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>>{\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
- const DenseIndex m_dim;\
- const DenseIndex m_offset;\
- EIGEN_STRONG_INLINE const DenseIndex& dimId() const { return m_dim; }\
- EIGEN_STRONG_INLINE const DenseIndex& offset() const { return m_offset; }\
- FunctorExtractor(const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>& expr)\
- : xprExpr(expr.impl()), m_dim(expr.dimId()), m_offset(expr.offset()) {}\
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{
+ typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;
+ typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;
+ const Dimensions m_dimensions;
+ const Dimensions& dimensions() const { return m_dimensions; }
+ FunctorExtractor(const TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)
+ : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}
};
-SYCLEXTRFUNCCHIPPINGOP(const)
-SYCLEXTRFUNCCHIPPINGOP()
-#undef SYCLEXTRFUNCCHIPPINGOP
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>
+: FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{};
/// template deduction function for FunctorExtractor
template <typename Evaluator>
auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
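The hunk above replaces the SYCLEXTRFUNC* macro pairs with plain specialisations: the const-qualified evaluator specialisation holds the extracted state, and the non-const one simply inherits from it. A minimal, self-contained sketch of that idiom follows; the types (MyOp, Extractor) are made up for illustration and are not the Eigen ones, which are partial specialisations over TensorEvaluator.

#include <iostream>

template <typename Expr> struct Extractor;   // primary template, never defined

struct MyOp { int payload; };                // stand-in for an expression node

// The const-qualified specialisation does the actual extraction.
template <> struct Extractor<const MyOp> {
  int payload;
  Extractor(const MyOp& op) : payload(op.payload) {}
};

// The non-const specialisation forwards to the const one, which is what the
// patch does in place of the old SYCLEXTRFUNC*(const) / SYCLEXTRFUNC*() pairs.
template <> struct Extractor<MyOp> : Extractor<const MyOp> {
  using Extractor<const MyOp>::Extractor;
};

int main() {
  MyOp op{42};
  Extractor<MyOp> e(op);           // resolves to the non-const specialisation
  std::cout << e.payload << "\n";  // prints 42
  return 0;
}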
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
deleted file mode 100644
index 2f77790..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ /dev/null
@@ -1,245 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: eigen@codeplay.com
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// General include header of SYCL target for Tensor Module
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
- template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
- OP op;
- OutputAccessor aOut;
- InputAccessor aI;
- LocalAccessor scratch;
- size_t length, local;
- GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
- : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- size_t globalid = itemID.get_global(0);
- size_t localid = itemID.get_local(0);
- /* All threads collectively read from global memory into local.
- * The barrier ensures all threads' IO is resolved before
- * execution continues (strictly speaking, all threads within
- * a single work-group - there is no co-ordination between
- * work-groups, only work-items). */
- if (globalid < length) {
- scratch[localid] = aI[globalid];
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- /* Apply the reduction operation between the current local
- * id and the one on the other half of the vector. */
- if (globalid < length) {
- auto min = (length < local) ? length : local;
- for (size_t offset = min / 2; offset > 0; offset /= 2) {
- if (localid < offset) {
- auto accum = op.initialize();
- op.reduce(scratch[localid], &accum);
- op.reduce(scratch[localid + offset], &accum);
- op.finalize(accum);
- scratch[localid]=accum;
- //scratch[localid] += scratch[localid + offset];
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- }
- /* The final result will be stored in local id 0. */
- if (localid == 0) {
- aI[itemID.get_group(0)] = scratch[localid];
- if((length<=local) && globalid ==0){
- auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
- aOutPtr[0]=scratch[0];
- }
- }
- }
- }
-
- };
-
-/// ReductionFunctor
-template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
- public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
- ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
- :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=static_cast<Index>(itemID.get_global_linear_id());
- if (globalid< range) {
- typename DeviceSelf::CoeffReturnType accum = functor.initialize();
- Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
- functor.finalize(accum);
- output_accessor_ptr[globalid]= accum;
- }
- }
- private:
- write_accessor output_accessor;
- FunctorExpr functors;
- Tuple_of_Acc tuple_of_accessors;
- Dims dims;
- Op functor;
- Index range;
-};
-
-template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index>
-class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
- public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
- typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
- ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
- Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
- :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=static_cast<Index>(itemID.get_global_linear_id());
- if (globalid< range) {
- typename DeviceSelf::CoeffReturnType accum = functor.initialize();
- Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
- functor.finalize(accum);
- output_accessor_ptr[globalid]= accum/num_values_to_reduce;
- }
- }
- private:
- write_accessor output_accessor;
- FunctorExpr functors;
- Tuple_of_Acc tuple_of_accessors;
- Dims dims;
- Op functor;
- Index range;
- Index num_values_to_reduce;
-};
-
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor{
-public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- OutAccessor tmp_global_accessor;
- Index rng , remaining, red_factor;
- Op op;
- Dims dims;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
-
- FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc)
- :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=itemID.get_global_linear_id();
-
- tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op))
- : static_cast<CoeffReturnType>(op.initialize());
-
- if(remaining!=0 && globalid==0 ){
- // this will add the rest of input buffer when the input size is not devidable to red_factor.
- auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
- reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
- auto accum = op.initialize();
- op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
- op.reduce(remaining_reduce, &accum);
- op.finalize(accum);
- tmp_global_accessor.get_pointer()[0]=accum;
-
- }
- }
-};
-
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{
-public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
-
- OutAccessor tmp_global_accessor;
- Index rng , remaining, red_factor;
- Op op;
- Dims dims;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
-
- FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc)
- :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=itemID.get_global_linear_id();
- auto scale = (rng*red_factor) + remaining;
-
- tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? ((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale)
- :static_cast<CoeffReturnType>(op.initialize())/scale;
-
- if(remaining!=0 && globalid==0 ){
- // this will add the rest of input buffer when the input size is not devidable to red_factor.
- auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
- auto accum = op.initialize();
- tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale;
- op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
- op.reduce(remaining_reduce, &accum);
- op.finalize(accum);
- tmp_global_accessor.get_pointer()[0]=accum/scale;
-
- }
- }
-};
-
-}
-}
-}
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
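The deleted GenericKernelReducer above performs a classic work-group tree reduction: each work-item copies one value into local scratch memory, then the upper half of the buffer is repeatedly folded onto the lower half with a local barrier between steps. A host-side sketch of just that halving loop, in plain C++ without SYCL, assuming a power-of-two group size and a sum reducer:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> scratch = {1, 2, 3, 4, 5, 6, 7, 8};  // one work-group's local buffer
  // Fold the upper half onto the lower half, mirroring the kernel's
  // "for (offset = min / 2; offset > 0; offset /= 2)" loop.
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
    for (std::size_t localid = 0; localid < offset; ++localid) {
      scratch[localid] += scratch[localid + offset];      // op.reduce(...) for a sum reducer
    }
    // In the SYCL kernel a local barrier sits here so every work-item sees
    // the partial sums before the next halving step.
  }
  std::cout << "group result = " << scratch[0] << "\n";   // prints 36
  return 0;
}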
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
index a1c112f..25d1fac 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -44,120 +44,68 @@ struct CategoryCount<Arg,Args...>{
};
/// specialisation of the \ref LeafCount struct when the node type is const TensorMap
-#define SYCLTENSORMAPLEAFCOUNT(CVQual)\
-template <typename PlainObjectType, int Options_, template <class> class MakePointer_>\
-struct LeafCount<CVQual TensorMap<PlainObjectType, Options_, MakePointer_> > {\
- static const size_t Count =1;\
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> > {
+ static const size_t Count =1;
};
-SYCLTENSORMAPLEAFCOUNT(const)
-SYCLTENSORMAPLEAFCOUNT()
-#undef SYCLTENSORMAPLEAFCOUNT
+/// specialisation of the \ref LeafCount struct when the node type is TensorMap
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct LeafCount<TensorMap<PlainObjectType, Options_, MakePointer_> > :LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> >{};
-// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp
-#define SYCLCATEGORYLEAFCOUNT(CVQual)\
-template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>\
-struct LeafCount<CVQual CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
-
-SYCLCATEGORYLEAFCOUNT(const)
-SYCLCATEGORYLEAFCOUNT()
-#undef SYCLCATEGORYLEAFCOUNT
+// const TensorCwiseUnaryOp, const TensorCwiseNullaryOp, const TensorCwiseBinaryOp, const TensorCwiseTernaryOp, and const TensorBroadcastingOp
+template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
+struct LeafCount<const CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
+// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp
+template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
+struct LeafCount<CategoryExpr<OP, RHSExpr...> > :LeafCount<const CategoryExpr<OP, RHSExpr...> >{};
/// specialisation of the \ref LeafCount struct when the node type is const TensorSelectOp is an exception
-#define SYCLSELECTOPLEAFCOUNT(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
-struct LeafCount<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
-
-SYCLSELECTOPLEAFCOUNT(const)
-SYCLSELECTOPLEAFCOUNT()
-#undef SYCLSELECTOPLEAFCOUNT
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>
+struct LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
+/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>
+struct LeafCount<TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >: LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > {};
-/// specialisation of the \ref LeafCount struct when the node type is TensorAssignOp
-#define SYCLLEAFCOUNTASSIGNOP(CVQual)\
-template <typename LHSExpr, typename RHSExpr>\
-struct LeafCount<CVQual TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
+/// specialisation of the \ref LeafCount struct when the node type is const TensorAssignOp
+template <typename LHSExpr, typename RHSExpr>
+struct LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
-SYCLLEAFCOUNTASSIGNOP(const)
-SYCLLEAFCOUNTASSIGNOP()
-#undef SYCLLEAFCOUNTASSIGNOP
+/// specialisation of the \ref LeafCount struct when the node type is
+/// TensorAssignOp. This is an exception: it is not the same as the unary case.
+template <typename LHSExpr, typename RHSExpr>
+struct LeafCount<TensorAssignOp<LHSExpr, RHSExpr> > :LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >{};
/// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp
-#define SYCLFORCEDEVALLEAFCOUNT(CVQual)\
-template <typename Expr>\
-struct LeafCount<CVQual TensorForcedEvalOp<Expr> > {\
- static const size_t Count =1;\
+template <typename Expr>
+struct LeafCount<const TensorForcedEvalOp<Expr> > {
+ static const size_t Count =1;
};
-SYCLFORCEDEVALLEAFCOUNT(const)
-SYCLFORCEDEVALLEAFCOUNT()
-#undef SYCLFORCEDEVALLEAFCOUNT
+/// specialisation of the \ref LeafCount struct when the node type is TensorForcedEvalOp
+template <typename Expr>
+struct LeafCount<TensorForcedEvalOp<Expr> >: LeafCount<const TensorForcedEvalOp<Expr> > {};
-/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
-#define EVALTOLEAFCOUNT(CVQual)\
-template <typename Expr>\
-struct LeafCount<CVQual TensorEvalToOp<Expr> > {\
- static const size_t Count = 1 + CategoryCount<Expr>::Count;\
+/// specialisation of the \ref LeafCount struct when the node type is const TensorEvalToOp
+template <typename Expr>
+struct LeafCount<const TensorEvalToOp<Expr> > {
+ static const size_t Count = 1 + CategoryCount<Expr>::Count;
};
-EVALTOLEAFCOUNT(const)
-EVALTOLEAFCOUNT()
-#undef EVALTOLEAFCOUNT
-
/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
-#define REDUCTIONLEAFCOUNT(CVQual)\
-template <typename OP, typename Dim, typename Expr>\
-struct LeafCount<CVQual TensorReductionOp<OP, Dim, Expr> > {\
- static const size_t Count =1;\
+template <typename OP, typename Dim, typename Expr>
+struct LeafCount<const TensorReductionOp<OP, Dim, Expr> > {
+ static const size_t Count =1;
};
-REDUCTIONLEAFCOUNT(const)
-REDUCTIONLEAFCOUNT()
-#undef REDUCTIONLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp
-#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\
-template <typename Indices, typename LhsXprType, typename RhsXprType>\
-struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\
- static const size_t Count =1;\
-};
-
-CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
-#undef CONTRACTIONCONVOLUTIONLEAFCOUNT
-
-
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
-#define SLICEOPLEAFCOUNT(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType>\
-struct LeafCount<CVQual TensorSlicingOp<StartIndices, Sizes, XprType> >:CategoryCount<XprType>{};
-
-SLICEOPLEAFCOUNT(const)
-SLICEOPLEAFCOUNT()
-#undef SLICEOPLEAFCOUNT
-
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp
-#define CHIPPINGOPLEAFCOUNT(CVQual)\
-template <DenseIndex DimId, typename XprType>\
-struct LeafCount<CVQual TensorChippingOp<DimId, XprType> >:CategoryCount<XprType>{};
-
-CHIPPINGOPLEAFCOUNT(const)
-CHIPPINGOPLEAFCOUNT()
-#undef CHIPPINGOPLEAFCOUNT
-
-
-#define SLICESTRIDEOPLEAFCOUNT(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
-struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{};
-
-SLICESTRIDEOPLEAFCOUNT(const)
-SLICESTRIDEOPLEAFCOUNT()
-#undef SLICESTRIDEOPLEAFCOUNT
+/// specialisation of the \ref LeafCount struct when the node type is TensorReductionOp
+template <typename OP, typename Dim, typename Expr>
+struct LeafCount<TensorReductionOp<OP, Dim, Expr> >: LeafCount<const TensorReductionOp<OP, Dim, Expr> >{};
+/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
+template <typename Expr>
+struct LeafCount<TensorEvalToOp<Expr> >: LeafCount<const TensorEvalToOp<Expr> >{};
} /// namespace TensorSycl
} /// namespace internal
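The LeafCount / CategoryCount machinery above is a compile-time count of the leaf nodes (TensorMap, forced evaluations, reductions) in an expression tree, now written as const specialisations with the non-const forms inheriting from them. A standalone sketch of the same recursion follows; Leaf, Node and LeafCounter are illustrative names, not the Eigen types.

#include <cstddef>
#include <iostream>

struct Leaf {};                                   // stands in for a TensorMap
template <typename L, typename R> struct Node {}; // stands in for a binary op

// By default a type counts as a single leaf, like the TensorMap specialisation.
template <typename T> struct LeafCounter {
  static const std::size_t Count = 1;
};

// An interior node contributes the leaves of both children, which is what
// CategoryCount<LHSExpr, RHSExpr> does for TensorAssignOp and friends.
template <typename L, typename R> struct LeafCounter<Node<L, R> > {
  static const std::size_t Count = LeafCounter<L>::Count + LeafCounter<R>::Count;
};

// The const case simply forwards to the non-const one (the patch forwards in
// the other direction, but the idea is the same).
template <typename T> struct LeafCounter<const T> : LeafCounter<T> {};

int main() {
  // (leaf op leaf) assigned to a leaf: three leaves in total.
  std::cout << LeafCounter<const Node<Node<Leaf, Leaf>, Leaf> >::Count << "\n";
  return 0;
}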
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
index 74566dc..d4c250c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -122,9 +122,9 @@ ASSIGNEXPR()
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorMap
#define TENSORMAPEXPR(CVQual)\
-template <typename T, int Options_, template <class> class MakePointer_, size_t N>\
-struct PlaceHolderExpression< CVQual TensorMap< T, Options_, MakePointer_>, N> {\
- typedef CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N> Type;\
+template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_, size_t N>\
+struct PlaceHolderExpression< CVQual TensorMap< Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> {\
+ typedef CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> Type;\
};
TENSORMAPEXPR(const)
@@ -157,18 +157,6 @@ EVALTO()
/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorChippingOp
-#define CHIPPINGOP(CVQual)\
-template <DenseIndex DimId, typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorChippingOp<DimId, Expr>, N> {\
- typedef CVQual TensorChippingOp< DimId, typename CalculateIndex <N, Expr>::ArgType> Type;\
-};
-
-CHIPPINGOP(const)
-CHIPPINGOP()
-#undef CHIPPINGOP
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorReductionOp
#define SYCLREDUCTION(CVQual)\
template <typename OP, typename Dims, typename Expr, size_t N>\
@@ -179,45 +167,6 @@ SYCLREDUCTION(const)
SYCLREDUCTION()
#undef SYCLREDUCTION
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorReductionOp
-#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
-template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\
-struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\
- typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\
-};
-SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTIONPLH
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorCwiseSelectOp
-#define SLICEOPEXPR(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, N> {\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename CalculateIndex<N, XprType>::ArgType> Type;\
-};
-
-SLICEOPEXPR(const)
-SLICEOPEXPR()
-#undef SLICEOPEXPR
-
-
-#define SYCLSLICESTRIDEOPPLH(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, N> {\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename CalculateIndex<N, XprType>::ArgType> Type;\
-};
-
-SYCLSLICESTRIDEOPPLH(const)
-SYCLSLICESTRIDEOPPLH()
-#undef SYCLSLICESTRIDEOPPLH
-
-
/// template deduction for \ref PlaceHolderExpression struct
template <typename Expr>
struct createPlaceHolderExpression {
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
index cac7855..7914b6f 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -25,70 +25,43 @@
namespace Eigen {
namespace TensorSycl {
-
-template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{
- typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
-
- typedef typename Expr::Index Index;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
- Index range;
- ExecExprFunctorKernel(Index range_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- : functors(functors_), tuple_of_accessors(tuple_of_accessors_), range(range_){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
- auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
- typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
- if (gId < range)
- device_evaluator.evalScalar(gId);
- }
-};
-
/// The run function in tensor sycl convert the expression tree to a buffer
/// based expression tree;
/// creates the expression tree for the device with accessor to buffers;
/// construct the kernel and submit it to the sycl queue.
-/// std::array does not have TotalSize. So I have to get the size through template specialisation.
-template<typename , typename Dimensions> struct DimensionSize{
- static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){
- return dim.TotalSize();
- }
-};
-#define DIMSIZEMACRO(CVQual)\
-template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\
- static inline Index getDimSize(const std::array<Index, NumDims>& dim){\
- return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\
- }\
-};
-
-DIMSIZEMACRO(const)
-DIMSIZEMACRO()
-#undef DIMSIZEMACRO
-
-
template <typename Expr, typename Dev>
void run(Expr &expr, Dev &dev) {
Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
- typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr;
- FunctorExpr functors = internal::extractFunctors(evaluator);
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // create a tuple of accessors from Evaluator
- typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType;
- TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator);
- typename Expr::Index range, GRange, tileSize;
- typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions()));
- dev.parallel_for_setup(total_size, tileSize, range, GRange);
+ typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
+ auto functors = internal::extractFunctors(evaluator);
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
- ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range
- , functors, tuple_of_accessors
- ));
+ size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+ dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+
+ // create a tuple of accessors from Evaluator
+ auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
+ const auto range = utility::tuple::get<0>(tuple_of_accessors).get_range()[0];
+ size_t GRange=range;
+ if (tileSize>GRange) tileSize=GRange;
+ else if(GRange>tileSize){
+ size_t xMode = GRange % tileSize;
+ if (xMode != 0) GRange += (tileSize - xMode);
+ }
+ // run the kernel
+ cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+ typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
+ auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+ if (itemID.get_global_linear_id() < range) {
+ device_evaluator.evalScalar(static_cast<int>(itemID.get_global_linear_id()));
+ }
+ });
});
- dev.asynchronousExec();
+ dev.m_queue.throw_asynchronous();
}
+
evaluator.cleanup();
}
} // namespace TensorSycl
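The rewritten run() above sizes the kernel launch by rounding the flattened coefficient range up to a multiple of the chosen tile (work-group) size; the padding work-items are filtered out by the `if (itemID.get_global_linear_id() < range)` guard inside the kernel. A small worked example of that rounding, with illustrative numbers:

#include <cstddef>
#include <iostream>

int main() {
  std::size_t range = 1000;    // number of coefficients to evaluate (example value)
  std::size_t tileSize = 128;  // e.g. max_work_group_size / 2 (example value)

  std::size_t GRange = range;
  if (tileSize > GRange) {
    tileSize = GRange;         // shrink the tile for very small problems
  } else {
    std::size_t xMode = GRange % tileSize;
    if (xMode != 0) GRange += tileSize - xMode;  // round up to a multiple of tileSize
  }
  std::cout << "tileSize=" << tileSize << " GRange=" << GRange << "\n";
  // prints tileSize=128 GRange=1024; items 1000..1023 fail the range check and do nothing.
  return 0;
}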
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
index 58ab0f0..063b027 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
@@ -20,7 +20,6 @@
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
-
namespace utility {
namespace tuple {
/// \struct StaticIf
@@ -232,5 +231,4 @@ Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
}
} // tuple
} // utility
-
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index a1e944e..ffcf8b0 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -58,8 +58,6 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
};
template <typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
-
};
};
@@ -78,8 +76,6 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
};
template <typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
-
};
};
@@ -102,8 +98,6 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
- typedef typename MakePointerT::RefType RefType;
-
};
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index d23f2e4..3523e7c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -23,7 +23,6 @@ struct static_val {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
- EIGEN_UNUSED_VARIABLE(v);
eigen_assert(v == n);
}
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 9dcc9da..354bce5 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -20,13 +20,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
typedef RunQueue<Task, 1024> Queue;
NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
- : NonBlockingThreadPoolTempl(num_threads, true, env) {}
-
- NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning,
- Environment env = Environment())
- : num_threads_(num_threads),
- allow_spinning_(allow_spinning),
- env_(env),
+ : env_(env),
threads_(num_threads),
queues_(num_threads),
coprimes_(num_threads),
@@ -34,20 +28,19 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
blocked_(0),
spinning_(0),
done_(false),
- cancelled_(false),
ec_(waiters_) {
- waiters_.resize(num_threads_);
+ waiters_.resize(num_threads);
- // Calculate coprimes of num_threads_.
+ // Calculate coprimes of num_threads.
// Coprimes are used for a random walk over all threads in Steal
// and NonEmptyQueueIndex. Iteration is based on the fact that if we take
// a walk starting thread index t and calculate num_threads - 1 subsequent
// indices as (t + coprime) % num_threads, we will cover all threads without
// repetitions (effectively getting a presudo-random permutation of thread
// indices).
- for (int i = 1; i <= num_threads_; i++) {
+ for (int i = 1; i <= num_threads; i++) {
unsigned a = i;
- unsigned b = num_threads_;
+ unsigned b = num_threads;
// If GCD(a, b) == 1, then a and b are coprimes.
while (b != 0) {
unsigned tmp = a;
@@ -58,33 +51,24 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
coprimes_.push_back(i);
}
}
- for (int i = 0; i < num_threads_; i++) {
+ for (int i = 0; i < num_threads; i++) {
queues_.push_back(new Queue());
}
- for (int i = 0; i < num_threads_; i++) {
+ for (int i = 0; i < num_threads; i++) {
threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
}
}
~NonBlockingThreadPoolTempl() {
done_ = true;
-
// Now if all threads block without work, they will start exiting.
// But note that threads can continue to work arbitrary long,
// block, submit new work, unblock and otherwise live full life.
- if (!cancelled_) {
- ec_.Notify(true);
- } else {
- // Since we were cancelled, there might be entries in the queues.
- // Empty them to prevent their destructor from asserting.
- for (size_t i = 0; i < queues_.size(); i++) {
- queues_[i]->Flush();
- }
- }
+ ec_.Notify(true);
// Join threads explicitly to avoid destruction order issues.
- for (size_t i = 0; i < num_threads_; i++) delete threads_[i];
- for (size_t i = 0; i < num_threads_; i++) delete queues_[i];
+ for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
+ for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
}
void Schedule(std::function<void()> fn) {
@@ -107,31 +91,14 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
// completes overall computations, which in turn leads to destruction of
// this. We expect that such scenario is prevented by program, that is,
// this is kept alive while any threads can potentially be in Schedule.
- if (!t.f) {
+ if (!t.f)
ec_.Notify(false);
- }
- else {
+ else
env_.ExecuteTask(t); // Push failed, execute directly.
- }
- }
-
- void Cancel() {
- cancelled_ = true;
- done_ = true;
-
- // Let each thread know it's been cancelled.
-#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
- for (size_t i = 0; i < threads_.size(); i++) {
- threads_[i]->OnCancel();
- }
-#endif
-
- // Wake up the threads without work to let them exit on their own.
- ec_.Notify(true);
}
int NumThreads() const final {
- return num_threads_;
+ return static_cast<int>(threads_.size());
}
int CurrentThreadId() const final {
@@ -155,8 +122,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
};
Environment env_;
- const int num_threads_;
- const bool allow_spinning_;
MaxSizeVector<Thread*> threads_;
MaxSizeVector<Queue*> queues_;
MaxSizeVector<unsigned> coprimes_;
@@ -164,7 +129,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
std::atomic<unsigned> blocked_;
std::atomic<bool> spinning_;
std::atomic<bool> done_;
- std::atomic<bool> cancelled_;
EventCount ec_;
// Main worker thread loop.
@@ -175,62 +139,32 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
pt->thread_id = thread_id;
Queue* q = queues_[thread_id];
EventCount::Waiter* waiter = &waiters_[thread_id];
- // TODO(dvyukov,rmlarsen): The time spent in Steal() is proportional
- // to num_threads_ and we assume that new work is scheduled at a
- // constant rate, so we set spin_count to 5000 / num_threads_. The
- // constant was picked based on a fair dice roll, tune it.
- const int spin_count =
- allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
- if (num_threads_ == 1) {
- // For num_threads_ == 1 there is no point in going through the expensive
- // steal loop. Moreover, since Steal() calls PopBack() on the victim
- // queues it might reverse the order in which ops are executed compared to
- // the order in which they are scheduled, which tends to be
- // counter-productive for the types of I/O workloads the single thread
- // pools tend to be used for.
- while (!cancelled_) {
- Task t = q->PopFront();
- for (int i = 0; i < spin_count && !t.f; i++) {
- if (!cancelled_.load(std::memory_order_relaxed)) {
- t = q->PopFront();
- }
- }
+ for (;;) {
+ Task t = q->PopFront();
+ if (!t.f) {
+ t = Steal();
if (!t.f) {
- if (!WaitForWork(waiter, &t)) {
- return;
+ // Leave one thread spinning. This reduces latency.
+        // TODO(dvyukov): 1000 iterations is based on a fair dice roll; tune it.
+        // Also, the time it takes to attempt to steal work 1000 times depends
+        // on the size of the thread pool. However, the speed at which the user
+        // of the thread pool submits tasks is independent of the size of the
+        // pool. Consider a time-based limit instead.
+ if (!spinning_ && !spinning_.exchange(true)) {
+ for (int i = 0; i < 1000 && !t.f; i++) {
+ t = Steal();
+ }
+ spinning_ = false;
}
- }
- if (t.f) {
- env_.ExecuteTask(t);
- }
- }
- } else {
- while (!cancelled_) {
- Task t = q->PopFront();
- if (!t.f) {
- t = Steal();
if (!t.f) {
- // Leave one thread spinning. This reduces latency.
- if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
- for (int i = 0; i < spin_count && !t.f; i++) {
- if (!cancelled_.load(std::memory_order_relaxed)) {
- t = Steal();
- } else {
- return;
- }
- }
- spinning_ = false;
- }
- if (!t.f) {
- if (!WaitForWork(waiter, &t)) {
- return;
- }
+ if (!WaitForWork(waiter, &t)) {
+ return;
}
}
}
- if (t.f) {
- env_.ExecuteTask(t);
- }
+ }
+ if (t.f) {
+ env_.ExecuteTask(t);
}
}
}
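In the loop above at most one idle worker keeps spinning on Steal() before falling back to WaitForWork(), which keeps scheduling latency low while burning at most one core. The exclusivity is claimed with a cheap load followed by an exchange on a single atomic flag; a reduced standalone sketch of that claim/release pattern (function names are illustrative):

  #include <atomic>

  std::atomic<bool> spinning(false);   // true while some worker is busy-polling

  // The initial load short-circuits the read-modify-write when another worker is
  // already spinning; exchange(true) returning false means this caller won the slot.
  bool try_become_spinner() {
    return !spinning.load(std::memory_order_relaxed) && !spinning.exchange(true);
  }

  void stop_spinning() { spinning.store(false); }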
@@ -267,18 +201,14 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
int victim = NonEmptyQueueIndex();
if (victim != -1) {
ec_.CancelWait(waiter);
- if (cancelled_) {
- return false;
- } else {
- *t = queues_[victim]->PopBack();
- return true;
- }
+ *t = queues_[victim]->PopBack();
+ return true;
}
    // The number of blocked threads is used as the termination condition.
    // If we are shutting down and all worker threads are blocked without work,
    // that means we are done.
blocked_++;
- if (done_ && blocked_ == num_threads_) {
+ if (done_ && blocked_ == threads_.size()) {
ec_.CancelWait(waiter);
// Almost done, but need to re-check queues.
// Consider that all queues are empty and all worker threads are preempted
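WaitForWork() uses the count of blocked workers as its shutdown test, and the re-check of the queues guards against the race the comment above describes: a closure can still be pushed after the other workers decided to block. A hedged standalone sketch of that decision only (illustrative names, not the pool's exact code):

  #include <cstddef>

  // Termination test only: the worker has already incremented the shared
  // 'blocked' counter. It may exit only if the pool is shutting down, every
  // worker is blocked, and a final re-scan of the queues still finds them
  // empty; otherwise it must go back and help drain the remaining work.
  bool may_exit(bool done, unsigned blocked, std::size_t pool_size,
                bool queues_empty_on_recheck) {
    return done && blocked == pool_size && queues_empty_on_recheck;
  }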
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 49d0cdc..05ed76c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -177,13 +177,6 @@ class RunQueue {
// Can be called by any thread at any time.
bool Empty() const { return Size() == 0; }
- // Delete all the elements from the queue.
- void Flush() {
- while (!Empty()) {
- PopFront();
- }
- }
-
private:
static const unsigned kMask = kSize - 1;
static const unsigned kMask2 = (kSize << 1) - 1;
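For reference, the deleted Flush() had no hidden behaviour: it popped from the front until Empty() reported true. A generic drain helper of the same shape (DrainQueue is a hypothetical name; Empty() and PopFront() mirror the RunQueue interface above):

  // Sketch of the removed Flush(): discard pending work items one by one.
  template <typename Queue>
  void DrainQueue(Queue& q) {
    while (!q.Empty()) {
      (void)q.PopFront();   // PopFront returns the element; we simply drop it
    }
  }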
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
index 3357286..e75d0f4 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -69,14 +69,6 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
}
}
- void Cancel() {
-#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
- for (size_t i = 0; i < threads_.size(); i++) {
- threads_[i]->OnCancel();
- }
-#endif
- }
-
int NumThreads() const final {
return static_cast<int>(threads_.size());
}
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
deleted file mode 100644
index a05685f..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
+++ /dev/null
@@ -1,23 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
-#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
-
-// Try to come up with a portable way to cancel a thread
-#if EIGEN_OS_GNULINUX
- #define EIGEN_THREAD_CANCEL(t) \
- pthread_cancel(t.native_handle());
- #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
-#else
-#define EIGEN_THREAD_CANCEL(t)
-#endif
-
-
-#endif // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
index d94a064..399f95c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -23,8 +23,6 @@ struct StlThreadEnvironment {
public:
EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
~EnvThread() { thr_.join(); }
- // This function is called when the threadpool is cancelled.
- void OnCancel() { }
private:
std::thread thr_;
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
index 84e1e6c..a65ee97 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -16,14 +16,8 @@ namespace Eigen {
// custom thread pools underneath.
class ThreadPoolInterface {
public:
- // Submits a closure to be run by a thread in the pool.
virtual void Schedule(std::function<void()> fn) = 0;
- // If implemented, stop processing the closures that have been enqueued.
- // Currently running closures may still be processed.
- // If not implemented, does nothing.
- virtual void Cancel() {}
-
// Returns the number of threads in the pool.
virtual int NumThreads() const = 0;
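Because the interface stays this small, Tensor code can be pointed at custom pools. A hypothetical standalone stand-in with the three members referenced in this patch, executing every closure inline (a real implementation would derive from Eigen::ThreadPoolInterface and override its pure virtuals):

  #include <functional>

  // Illustrative stand-in only; member names mirror the interface shown above.
  class InlineThreadPool {
   public:
    void Schedule(std::function<void()> fn) { fn(); }  // run the closure immediately
    int NumThreads() const { return 1; }
    int CurrentThreadId() const { return 0; }          // always "thread 0"
  };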
diff --git a/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index 49d315a..ec27edd 100644
--- a/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -40,7 +40,7 @@ template<typename T, T... nn>
struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
template<typename T, T n, T... nn>
-struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; };
+struct numeric_list<T, n, nn...> { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; };
/* numeric list constructors
*
@@ -123,10 +123,6 @@ template<typename a, typename... as> struct get<0, type_lis
template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {};
template<typename T, T a, T... as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; };
-template<std::size_t n, typename T, T a, T... as> constexpr T array_get(const numeric_list<T, a, as...>&) {
- return get<(int)n, numeric_list<T, a, as...>>::value;
-}
-
/* always get type, regardless of dummy; good for parameter pack expansion */
template<typename T, T dummy, typename t> struct id_numeric { typedef t type; };
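The switch from static const to constexpr static keeps count and first_value usable in constant expressions without out-of-class definitions, and values remain reachable through get<> now that the numeric_list overload of array_get is gone. A standalone sketch of the member part of that pattern, re-declared outside Eigen's namespace purely for illustration:

  #include <cstddef>

  template<typename T, T... nn>
  struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };

  template<typename T, T n, T... nn>
  struct numeric_list<T, n, nn...> {
    constexpr static std::size_t count = sizeof...(nn) + 1;
    constexpr static T first_value = n;
  };

  using dims = numeric_list<int, 4, 2, 7>;
  static_assert(dims::count == 3, "three entries");
  static_assert(dims::first_value == 4, "head of the list");
  int buffer[dims::count];   // usable as an array bound, i.e. in a constant expression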
diff --git a/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 573ca43..30d3ebc 100644
--- a/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -169,7 +169,6 @@ template <typename T> class array<T, 0> {
#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
- EIGEN_UNUSED_VARIABLE(l);
eigen_assert(l.size() == 0);
}
#endif
@@ -201,15 +200,19 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
return a[I];
}
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<array<T,N> > {
static const size_t value = N;
};
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<array<T,N>& > {
static const size_t value = N;
};
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<const array<T,N> > {
static const size_t value = N;
};
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<const array<T,N>& > {
static const size_t value = N;
};
@@ -248,6 +251,14 @@ template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_
#undef STD_GET_ARR_HACK
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
+ static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<std::array<T,N> > {
+ static const size_t value = N;
+};
} // end namespace internal
} // end namespace Eigen
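With the forward declarations and the new std::array specializations above, generic code can query a compile-time length through one array_size trait regardless of which array flavour it received. A hedged standalone sketch of the same idea for std::array only, re-declared outside Eigen for illustration:

  #include <array>
  #include <cstddef>

  template <typename T> struct array_size;   // primary template: intentionally undefined

  template <class T, std::size_t N>
  struct array_size<std::array<T, N> > { static const std::size_t value = N; };

  template <class T, std::size_t N>
  struct array_size<const std::array<T, N> > { static const std::size_t value = N; };

  static_assert(array_size<std::array<float, 3> >::value == 3, "compile-time length");
  static_assert(array_size<const std::array<int, 5> >::value == 5, "const-qualified too");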