Diffstat (limited to 'eigen/unsupported')
-rw-r--r--  eigen/unsupported/CMakeLists.txt | 10
-rw-r--r--  eigen/unsupported/Eigen/CXX11/Tensor | 9
-rw-r--r--  eigen/unsupported/Eigen/CXX11/ThreadPool | 13
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/README.md | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 30
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 287
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 134
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 101
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 76
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 400
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 208
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 3
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 476
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 7
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 4
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 413
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 13
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 12
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 5
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 11
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 4
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 41
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 14
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 4
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 12
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 5
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 53
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 7
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 30
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 1
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 179
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 5
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 10
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 16
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h | 12
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h | 54
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h | 188
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h | 265
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h | 322
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h | 245
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 138
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 57
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h | 77
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 1
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 142
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h | 7
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h | 23
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h | 13
-rw-r--r--  eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h | 7
-rw-r--r--  eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h | 257
-rw-r--r--  eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h | 184
-rw-r--r--  eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h | 11
-rw-r--r--  eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h | 2
-rw-r--r--  eigen/unsupported/Eigen/src/Polynomials/Companion.h | 50
-rw-r--r--  eigen/unsupported/Eigen/src/Polynomials/PolynomialSolver.h | 18
-rw-r--r--  eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h | 89
-rw-r--r--  eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h | 8
-rw-r--r--  eigen/unsupported/doc/examples/EulerAngles.cpp | 4
-rw-r--r--  eigen/unsupported/test/CMakeLists.txt | 21
-rw-r--r--  eigen/unsupported/test/EulerAngles.cpp | 296
-rw-r--r--  eigen/unsupported/test/autodiff_scalar.cpp | 15
-rw-r--r--  eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp | 24
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp | 114
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_builtins_sycl.cpp | 267
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_chipping.cpp | 8
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_chipping_sycl.cpp | 622
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_concatenation_sycl.cpp | 180
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_contract_sycl.cpp | 290
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_convolution_sycl.cpp | 469
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_device_sycl.cpp | 60
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_expr.cpp | 46
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_fixed_size.cpp | 2
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp | 54
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_morphing_sycl.cpp | 248
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_notification.cpp | 17
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu | 6
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_padding_sycl.cpp | 157
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp | 167
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_reverse_sycl.cpp | 221
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_shuffling_sycl.cpp | 119
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_striding_sycl.cpp | 203
-rw-r--r--  eigen/unsupported/test/cxx11_tensor_sycl.cpp | 219
-rw-r--r--  eigen/unsupported/test/polynomialsolver.cpp | 34
-rw-r--r--  eigen/unsupported/test/sparse_extra.cpp | 23
96 files changed, 1376 insertions, 7343 deletions
diff --git a/eigen/unsupported/CMakeLists.txt b/eigen/unsupported/CMakeLists.txt
index 9a56661..4fef40a 100644
--- a/eigen/unsupported/CMakeLists.txt
+++ b/eigen/unsupported/CMakeLists.txt
@@ -1,9 +1,7 @@
add_subdirectory(Eigen)
add_subdirectory(doc EXCLUDE_FROM_ALL)
-if(BUILD_TESTING)
- if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
- add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
- else()
- add_subdirectory(test EXCLUDE_FROM_ALL)
- endif()
+if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+ add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
+else()
+ add_subdirectory(test EXCLUDE_FROM_ALL)
endif()
diff --git a/eigen/unsupported/Eigen/CXX11/Tensor b/eigen/unsupported/Eigen/CXX11/Tensor
index 3991609..7ecb4c7 100644
--- a/eigen/unsupported/Eigen/CXX11/Tensor
+++ b/eigen/unsupported/Eigen/CXX11/Tensor
@@ -13,14 +13,13 @@
#include "../../../Eigen/Core"
-#if defined(EIGEN_USE_SYCL)
+#ifdef EIGEN_USE_SYCL
#undef min
#undef max
#undef isnan
#undef isinf
#undef isfinite
#include <SYCL/sycl.hpp>
-#include <iostream>
#include <map>
#include <memory>
#include <utility>
@@ -53,10 +52,8 @@ typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
-#include <windows.h>
#else
#include <stdint.h>
-#include <unistd.h>
#endif
#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
@@ -71,10 +68,6 @@ typedef unsigned __int64 uint64_t;
#include <time.h>
#endif
-#if defined(EIGEN_USE_LIBXSMM)
-#include "libxsmm.h"
-#endif
-
#ifdef EIGEN_USE_THREADS
#include "ThreadPool"
#endif
diff --git a/eigen/unsupported/Eigen/CXX11/ThreadPool b/eigen/unsupported/Eigen/CXX11/ThreadPool
index c346141..09d637e 100644
--- a/eigen/unsupported/Eigen/CXX11/ThreadPool
+++ b/eigen/unsupported/Eigen/CXX11/ThreadPool
@@ -50,7 +50,6 @@
#include "src/ThreadPool/ThreadLocal.h"
#include "src/ThreadPool/ThreadYield.h"
-#include "src/ThreadPool/ThreadCancel.h"
#include "src/ThreadPool/EventCount.h"
#include "src/ThreadPool/RunQueue.h"
#include "src/ThreadPool/ThreadPoolInterface.h"
@@ -58,18 +57,6 @@
#include "src/ThreadPool/SimpleThreadPool.h"
#include "src/ThreadPool/NonBlockingThreadPool.h"
-
-// Use the more efficient NonBlockingThreadPool by default.
-namespace Eigen {
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
-template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
-typedef NonBlockingThreadPool ThreadPool;
-#else
-template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
-typedef SimpleThreadPool ThreadPool;
-#endif
-} // namespace Eigen
-
#endif
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md b/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
index 38cdb9c..98e8381 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1737,9 +1737,11 @@ TODO
## Representation of scalar values
-Scalar values are often represented by tensors of size 1 and rank 0.For example
-Tensor<T, N>::maximum() currently returns a Tensor<T, 0>. Similarly, the inner
-product of 2 1d tensors (through contractions) returns a 0d tensor.
+Scalar values are often represented by tensors of size 1 and rank 1. It would be
+more logical and user friendly to use tensors of rank 0 instead. For example
+Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner
+product of 2 1d tensors (through contractions) returns a 1d tensor. In the
+future these operations might be updated to return 0d tensors instead.
## Limitations
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index fbe3408..7a45a5c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -186,12 +186,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived>
- expm1() const {
- return unaryExpr(internal::scalar_expm1_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
log() const {
return unaryExpr(internal::scalar_log_op<Scalar>());
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 23a7446..4cfe300 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -54,7 +54,7 @@ struct is_input_scalar<Sizes<> > {
static const bool value = true;
};
#ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::ptrdiff_t... Indices>
+template <typename std::size_t... Indices>
struct is_input_scalar<Sizes<Indices...> > {
static const bool value = (Sizes<Indices...>::total_size == 1);
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index c46a778..1ba7ef1 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -50,7 +50,6 @@ template <DenseIndex DimId>
struct DimensionId
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
- EIGEN_UNUSED_VARIABLE(dim);
eigen_assert(dim == DimId);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
@@ -151,7 +150,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset())
+ : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{
EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(NumInputDims > m_dim.actualDim());
@@ -207,7 +206,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
Index inputIndex = index * m_inputStride + m_inputOffset;
@@ -219,7 +218,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(m_stride > index);
return m_impl.template packet<LoadMode>(index + m_inputOffset);
@@ -275,29 +274,17 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
}
}
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const {
- return m_dim.actualDim();
- }
-
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const {
- return m_offset;
- }
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex;
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
inputIndex = index * m_inputStride + m_inputOffset;
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(m_stride > index);
inputIndex = index + m_inputOffset;
@@ -317,9 +304,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
TensorEvaluator<ArgType, Device> m_impl;
const internal::DimensionId<DimId> m_dim;
const Device& m_device;
-// required by sycl
- const DenseIndex m_offset;
-
};
@@ -360,7 +344,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
- (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
+ (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(this->m_stride == 1);
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
@@ -371,7 +355,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
inputIndex += this->m_inputStride;
}
} else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) ||
- (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
+ (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(this->m_stride > index);
this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 2c7ba96..59bf90d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -276,12 +276,6 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
- /// required by sycl in order to extract the accessor
- const Axis& axis() const { return m_axis; }
protected:
Dimensions m_dimensions;
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index bf4a476..20b29e5 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -20,70 +20,6 @@ namespace Eigen {
*
*/
namespace internal {
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-template<typename Scalar, typename Index>
-void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) {
- size_t psize = packet_traits<Scalar>::size; // Packet size
- typedef typename packet_traits<Scalar>::type Packet; // Packet type
- size_t alignment = psize*sizeof(Scalar); // Needed alignment
- if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 &&
- (ldsrc*sizeof(Scalar)) % alignment == 0 &&
- reinterpret_cast<uintptr_t>(src) % alignment == 0 &&
- reinterpret_cast<uintptr_t>(dst) % alignment == 0) {
- // Optimized version using packets
- size_t num_packets = rows / psize;
- for (Index col = 0; col < cols; ++col) {
- EIGEN_ASM_COMMENT("begin pack_simple inner copy");
- // Unrolled manually 4 times.
- for (size_t i=0; i < num_packets/4; ++i) {
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- }
- for (size_t i=0; i < num_packets%4; ++i) {
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- }
- dst += lddst - num_packets*psize;
- src += ldsrc - num_packets*psize;
- EIGEN_ASM_COMMENT("end pack_simple inner copy");
- }
- } else {
- // Naive memcpy calls
- for (Index col = 0; col < cols; ++col) {
- memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
- }
- }
-}
-
-template<typename LhsScalar, typename RhsScalar, typename Scalar>
- struct libxsmm_wrapper {
- libxsmm_wrapper() {}
- libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {}
- void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {}
- void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {}
- };
-
- template<>
- struct libxsmm_wrapper<float, float, float>: public libxsmm_mmfunction<float> {
- libxsmm_wrapper(): libxsmm_mmfunction() {}
- libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
- libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
- };
-
- template<>
- struct libxsmm_wrapper<double, double, double>: public libxsmm_mmfunction<double> {
- libxsmm_wrapper(): libxsmm_mmfunction() {}
- libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
- libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
- };
-#endif
-
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
@@ -222,7 +158,7 @@ struct TensorContractionEvaluatorBase
m_device(device),
m_result(NULL) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
- static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+ static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -381,8 +317,6 @@ struct TensorContractionEvaluatorBase
}
}
- EnableXSMMIfPossible(eval_op_indices);
-
// If the layout is RowMajor, we need to reverse the m_dimensions
if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -393,7 +327,7 @@ struct TensorContractionEvaluatorBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar * data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
if (data) {
@@ -488,13 +422,6 @@ struct TensorContractionEvaluatorBase
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
- #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- if (m_can_use_xsmm) {
- evalGemmXSMM(buffer);
- return;
- }
- #endif
-
// columns in left side, rows in right side
const Index k = this->m_k_size;
@@ -611,214 +538,7 @@ struct TensorContractionEvaluatorBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
-protected:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) {
- m_can_use_xsmm = false;
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
- if (!std::is_same<Scalar, LhsScalar>::value ||
- !std::is_same<Scalar, RhsScalar>::value ||
- !(std::is_same<Scalar, float>::value ||
- std::is_same<Scalar, double>::value) ||
- m_leftImpl.data() == NULL ||
- m_rightImpl.data() == NULL) {
- return;
- }
-
- // Check if we can use faster matmul algorithms. For contraction to be
- // equivalent to matmul, we need both lhs and rhs contracting dims sequences
- // to be either a prefix or suffix of all dims. Also, the order of both
- // must be the same, so we don't have to do reordering.
- // For example:
- // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)]
- // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)]
- // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)]
- // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)]
- // Depending if contraction dims are prefix or suffix of all dims we need to
- // pre-transpose matrices in matmul algorithm:
- // lhs: prefix -> transpose, suffix -> no transpose
- // rhs: prefix -> no transpose, suffix -> transpose
- // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular,
- // non-transposed matmul.
- if (ContractDims == 0) {
- // This case is totally uninteresting, filter it out to avoid problems
- // with iterations in further tests.
- return;
- }
-
- // Check if RHS dims list is increasing. LHS already is, so if not, the
- // order is different and we cannot do matmul.
- for (int i = 1; i < ContractDims; i++) {
- if (eval_op_indices[i].second < eval_op_indices[i-1].second) {
- return;
- }
- }
-
- // Check if no holes.
- int diff;
- for (int i = 1; i < ContractDims; i++) {
- // LHS contract dims are sorted to form an increasing seq.
- diff = eval_op_indices[i].first - eval_op_indices[i-1].first;
- if (diff != 1) {
- return;
- }
- // Now we may already assume RHS contract dims seq is increasing too.
- diff = eval_op_indices[i].second - eval_op_indices[i-1].second;
- if (diff != 1) {
- return;
- }
- }
-
- // Check if suffix or prefix.
- if (eval_op_indices[0].first != 0 &&
- eval_op_indices[ContractDims-1].first != LDims-1) {
- return;
- }
- if (eval_op_indices[0].second != 0 &&
- eval_op_indices[ContractDims-1].second != RDims-1) {
- return;
- }
-
- m_can_use_xsmm = true;
-#else
- EIGEN_UNUSED_VARIABLE(eval_op_indices);
-#endif
- }
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
-
- // rows in left side
- const Index m = this->m_i_size;
-
- // columns in right side
- const Index n = this->m_j_size;
-
- const bool transposeA = !m_lhs_inner_dim_contiguous;
- const bool transposeB = !m_rhs_inner_dim_contiguous;
-
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
- internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index> blocking(
- k, m, n, 1, transposeA, transposeB);
-
- // Outer blocks sizes
- const Index mc_outer = blocking.outer_m();
- const Index nc_outer = blocking.outer_n();
- const Index kc_outer = blocking.outer_k();
- // Inner blocks sizes
- const Index mc = blocking.mc();
- const Index nc = blocking.nc();
- const Index kc = blocking.kc();
- // Decisions whether we should copy parts of matrices
- const bool copyA = blocking.copyA();
- const bool copyB = blocking.copyB();
-
- const LhsScalar* leftData = m_leftImpl.data();
- const RhsScalar* rightData = m_rightImpl.data();
-
- const libxsmm_blasint stride_A = static_cast<libxsmm_blasint>(transposeA ? k : m);
- const libxsmm_blasint stride_B = static_cast<libxsmm_blasint>(transposeB ? n : k);
- const libxsmm_blasint stride_C = static_cast<libxsmm_blasint>(m);
-
- const libxsmm_blasint stride_blockA = static_cast<libxsmm_blasint>(mc);
- // Use bigger stride to avoid hitting same cache line too often.
- // This consistently gives +~0.5 Gflops.
- const libxsmm_blasint stride_panelB = static_cast<libxsmm_blasint>(
- kc % 32 == 0 ? kc + 16 : kc
- );
-
- // Kernel for the general case (not edges)
- internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar> kernel;
-
- LhsScalar* blockA = NULL;
- RhsScalar* panelB = NULL;
-
- if (copyA) {
- blockA = static_cast<LhsScalar*>(this->m_device.allocate(mc * kc * sizeof(LhsScalar)));
- }
- if (copyB) {
- panelB = static_cast<RhsScalar*>(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar)));
- }
-
- const Index kernel_stride_A = copyA ? stride_blockA : stride_A;
- const Index kernel_stride_B = copyB ? stride_panelB : stride_B;
- kernel = internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch());
-
- // Outer blocking
- for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) {
- for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) {
- for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) {
- using numext::mini;
-
- Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer;
-
- // Inner blocking
- for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) {
- const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki;
- const float beta = ki == 0 ? 0 : 1;
-
- if (copyB) {
- if (transposeB) {
- libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB);
- } else {
- internal::pack_simple<RhsScalar, Index>(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B);
- }
- }
-
- for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) {
- const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi;
-
- const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki :
- leftData + ki*stride_A + mi;
-
- if (copyA) {
- if (transposeA) {
- libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA);
- } else {
- internal::pack_simple<LhsScalar, Index>(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A);
- }
- }
- const LhsScalar* actual_a = copyA ? blockA : a;
-
- for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) {
- const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni;
-
- const RhsScalar* b = rightData + ni*stride_B + ki;
- Scalar* c = buffer + ni*stride_C + mi;
- const Scalar* cp = c + nc*stride_C;
-
- const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b;
- const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B;
-
- if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) {
- // Most used, cached kernel.
- kernel(actual_a, actual_b, c, actual_a, bp, cp);
- } else {
- // Edges - use libxsmm kernel cache.
- internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp);
- }
- }
- }
- }
- }
- }
- }
-
- if (copyA) {
- this->m_device.deallocate(blockA);
- }
- if (copyB) {
- this->m_device.deallocate(panelB);
- }
- }
-#endif
-
+ protected:
// Prevent assignment
TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
Dimensions m_dimensions;
@@ -844,7 +564,6 @@ protected:
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
const Device& m_device;
Scalar* m_result;
- bool m_can_use_xsmm;
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index d34f9ca..5cf7b4f 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -50,140 +50,6 @@ class TensorContractionBlocking {
};
-
-#if defined(EIGEN_USE_LIBXSMM)
-template <typename LhsScalar, typename RhsScalar, typename Index>
-class TensorXsmmContractionBlocking {
- public:
- TensorXsmmContractionBlocking(Index k, Index m, Index n,
- size_t max_num_threads = 1, bool transposeA = false,
- bool transposeB = false):
- k_(k), m_(m), n_(n), transposeA_(transposeA),
- transposeB_(transposeB), num_threads_(max_num_threads) {
-#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
- if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
- mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
- kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
- nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
- outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
- outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
- outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
- copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
- copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
- outer_m_ = outer_m_ != 0 ? outer_m_ : m;
- outer_k_ = outer_k_ != 0 ? outer_k_ : k;
- outer_n_ = outer_n_ != 0 ? outer_n_ : n;
- }
-#else
- // Defaults, possibly overriden per-platform.
- copyA_ = true;
- copyB_ = false;
-
- // If the matrix is small enough, don't do blocking, just call single xsmm
- // kernel.
- if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) {
- mc_ = m; kc_ = k; nc_ = n;
- outer_m_ = m; outer_k_ = k; outer_n_ = n;
- copyA_ = false; copyB_ = false;
- } else {
- int arch = libxsmm_cpuid_x86();
-
- if (arch == LIBXSMM_X86_AVX512_CORE) {
- // skylake
- mc_ = 64; kc_ = 64; nc_ = 24;
- outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
- // Hack to use this kernel architecture as the other one has performance
- // issues (no hardware prefetching).
- // TODO(nishantpatil): This should be removed if the issues are fixed,
- // or this one becomes the default.
- setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
- } else if (arch == LIBXSMM_X86_AVX2) {
- // haswell
- mc_ = 32; kc_ = 192; nc_ = 33;
- outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
- } else if (arch == LIBXSMM_X86_AVX) {
- // ivybridge
- mc_ = 32; kc_ = 192; nc_ = 48;
- outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
- } else {
- // generic kernel size, usually performing well
- mc_ = 32; kc_ = 128; nc_ = 32;
- outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
- }
-
- // Only copy if it makes the stride smaller.
- copyA_ = copyA_ && (m > mc_);
- copyB_ = copyB_ && (k > kc_);
- }
-
- // We need to copy anyway if transposing
- copyA_ = copyA_ || transposeA;
- copyB_ = copyB_ || transposeB;
-
- // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h
- prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
-
-#endif
-
- mc_ = mc_ > m ? m : mc_;
- nc_ = nc_ > n ? n : nc_;
- kc_ = kc_ > k ? k : kc_;
-
- size_t compute_parallelism = (m / mc_) * (n / nc_);
- size_t pack_parallelism = 0;
- if (copyA_) {
- pack_parallelism += (m / mc_) * (k / kc_);
- }
- if (copyB_) {
- pack_parallelism += (n / nc_) * (k / kc_);
- }
- size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism);
-
- num_threads_ = numext::mini<size_t>(num_threads_,
- parallelism / MIN_JOBS_PER_THREAD);
- num_threads_ = numext::maxi<size_t>(num_threads_, 1);
-
- // For optimal performance outer block sizes should be multiplies of kernel
- // sizes, or bigger than matrix size (=no outer blocking).
- eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
- eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
- eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
- }
-
- EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
- EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
- EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
- EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
- EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
- EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
- EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
- EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
- EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
- EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
- EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
- EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
- EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
- EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
- EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
- return prefetch_;
- }
-
- private:
- Index k_, m_, n_;
- Index kc_, mc_, nc_;
- Index outer_k_, outer_m_, outer_n_;
- bool copyA_, copyB_, transposeA_, transposeB_;
- size_t num_threads_;
-
- // Threshold for m*k*n to skip blocking and just call libxsmm
- const double LIBXSMM_THRESHOLD = 80*80*80;
- // For computing optimal number of threads - so that each thread gets at least
- // that many jobs.
- const double MIN_JOBS_PER_THREAD = 3;
- libxsmm_gemm_prefetch_type prefetch_;
-};
-#endif // EIGEN_USE_LIBXSMM
-
} // end namespace internal
} // end namespace Eigen
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index c04b784..d65dbb4 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -529,6 +529,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
float2 rhs_shmem2[][8], const Index m_size,
const Index n_size, const Index k_size,
const Index base_m, const Index base_n) {
+ typedef float Scalar;
// prefetch registers
float4 lhs_pf0, rhs_pf0;
@@ -539,27 +540,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
}
-#define prefetch_lhs(reg, row, col) \
- if (!CHECK_LHS_BOUNDARY) { \
- if (col < k_size) { \
- reg =lhs.template loadPacket<Unaligned>(row, col); \
- } \
- } else { \
- if (col < k_size) { \
- if (row + 3 < m_size) { \
- reg =lhs.template loadPacket<Unaligned>(row, col); \
- } else if (row + 2 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- reg.z =lhs(row + 2, col); \
- } else if (row + 1 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- } else if (row < m_size) { \
- reg.x =lhs(row + 0, col); \
- } \
- } \
- } \
+#define prefetch_lhs(reg, row, col) \
+ if (!CHECK_LHS_BOUNDARY) { \
+ if (col < k_size) { \
+ reg =lhs.loadPacket<Unaligned>(row, col); \
+ } \
+ } else { \
+ if (col < k_size) { \
+ if (row + 3 < m_size) { \
+ reg =lhs.loadPacket<Unaligned>(row, col); \
+ } else if (row + 2 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ } else if (row + 1 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ } else if (row < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ } \
+ } \
+ } \
Index lhs_vert = base_m+threadIdx.x*4;
@@ -577,7 +578,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -592,7 +593,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
} else {
if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
@@ -765,6 +766,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
float2 rhs_shmem2[][8], const Index m_size,
const Index n_size, const Index k_size,
const Index base_m, const Index base_n) {
+ typedef float Scalar;
// prefetch registers
float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
@@ -788,37 +790,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_LHS_BOUNDARY) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else {
// just CHECK_LHS_BOUNDARY
if (lhs_vert + 3 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else if (lhs_vert + 2 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
@@ -907,8 +909,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -930,8 +932,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (rhs_horiz1 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -952,7 +954,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
} else if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -1135,6 +1137,9 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
typedef float2 LHS_MEM[64][32];
typedef float2 RHS_MEM[128][8];
+ typedef float2 LHS_MEM16x16[32][16];
+ typedef float2 RHS_MEM16x16[64][8];
+
const Index m_block_idx = blockIdx.x;
const Index n_block_idx = blockIdx.y;
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index ab320a5..9b2cb3f 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -22,14 +22,8 @@ enum {
/*
* Implementation of the Eigen blas_data_mapper class for tensors.
*/
-/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which
-/// is scalar * for CoeffLoader.
-template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer> struct CoeffLoader;
-template<typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t,
- int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
- template <class> class MakePointer_ = MakePointer> class BaseTensorContractionMapper;
-
-template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_> struct CoeffLoader {
+
+template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
enum {
DirectOffsets = false
};
@@ -53,7 +47,7 @@ template <typename Tensor, bool HasRawAccess, template <class> class MakePointer
const Tensor m_tensor;
};
-template <typename Tensor, template <class> class MakePointer_> struct CoeffLoader<Tensor, true, MakePointer_> {
+template <typename Tensor> struct CoeffLoader<Tensor, true> {
enum {
DirectOffsets = true
};
@@ -73,14 +67,13 @@ template <typename Tensor, template <class> class MakePointer_> struct CoeffLoad
}
private:
typedef typename Tensor::Scalar Scalar;
-
- typename MakePointer_<const Scalar>::Type m_data;
+ const Scalar* m_data;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
- int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
+ int packet_size, bool inner_dim_contiguous, int Alignment>
class SimpleTensorContractionMapper {
public:
EIGEN_DEVICE_FUNC
@@ -96,7 +89,7 @@ class SimpleTensorContractionMapper {
m_k_strides(k_strides) { }
enum {
- DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
+ DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
};
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
@@ -213,22 +206,23 @@ class SimpleTensorContractionMapper {
}
protected:
- CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
+ CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
const nocontract_t m_nocontract_strides;
const nocontract_t m_ij_strides;
const contract_t m_contract_strides;
const contract_t m_k_strides;
};
+
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous,
- bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
-class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
+ bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
{
public:
- typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+ typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
@@ -241,9 +235,9 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
typedef typename Tensor::PacketReturnType Packet;
typedef typename unpacket_traits<Packet>::half HalfPacket;
- template <typename PacketT,int AlignmentType>
+ template <int AlignmentType>
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
+ EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
@@ -281,13 +275,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
}
data[packet_size - 1] = this->m_tensor.coeff(last);
- return pload<PacketT>(data);
- }
-
- template <int AlignmentType>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
- return this->load<Packet,AlignmentType>(i,j);
+ return pload<Packet>(data);
}
template <int AlignmentType>
@@ -313,11 +301,11 @@ template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
bool inner_dim_contiguous,
- bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
-class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
+ bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
{
public:
- typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+ typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
@@ -334,12 +322,6 @@ class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, con
data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
return pload<typename Tensor::PacketReturnType>(data);
}
- template <typename PacketT,int> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
- EIGEN_ALIGN_MAX Scalar data[1];
- data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
- return pload<PacketT>(data);
- }
template <int> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
return loadPacket(i, j);
@@ -351,14 +333,14 @@ template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionSubMapper {
public:
typedef typename Tensor::PacketReturnType Packet;
typedef typename unpacket_traits<Packet>::half HalfPacket;
- typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
- typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
typedef Self LinearMapper;
enum {
@@ -403,14 +385,6 @@ class TensorContractionSubMapper {
return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
}
- template <typename PacketT, int AlignmentType>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
- if (UseDirectOffsets) {
- return m_base_mapper.template load<PacketT,AlignmentType>(i, j);
- }
- return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
- }
-
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
if (UseDirectOffsets) {
return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
@@ -418,7 +392,7 @@ class TensorContractionSubMapper {
return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet& p) const {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
if (UseDirectOffsets) {
m_base_mapper.storePacket(i, 0, p);
}
@@ -458,14 +432,14 @@ template<typename Scalar_, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper
- : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
+ : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
public:
typedef Scalar_ Scalar;
- typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
- typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper VectorMapper;
EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
deleted file mode 100644
index e87de0c..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ /dev/null
@@ -1,400 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclConvertToDeviceExpression.h
- *
- * \brief:
- * TensorContractionsycl
- *
-*****************************************************************/
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
-namespace Eigen {
-
-template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> :
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > {
-
- typedef const Eigen::SyclDevice Device;
-
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
- typedef TensorContractionEvaluatorBase<Self> Base;
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
- enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- };
-
- // Most of the code is assuming that both input tensors are ColMajor. If the
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
- // will pretend B is LHS and A is RHS.
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
- static const int LDims =
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
- static const int RDims =
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
- static const int ContractDims = internal::array_size<Indices>::value;
-
- typedef array<Index, LDims> left_dim_mapper_t;
- typedef array<Index, RDims> right_dim_mapper_t;
-
- typedef array<Index, ContractDims> contract_t;
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
- static const int NumDims = LDims + RDims - 2 * ContractDims;
-
- typedef DSizes<Index, NumDims> Dimensions;
-
- // typedefs needed in evalTo
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- typedef typename LeftEvaluator::Dimensions LeftDimensions;
- typedef typename RightEvaluator::Dimensions RightDimensions;
-
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
- Base(op, device) {}
-
- // We need to redefine this method to make nvcc happy
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
- this->m_leftImpl.evalSubExprsIfNeeded(NULL);
- this->m_rightImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- evalTo(data);
- return false;
- } else {
- this->m_result = static_cast<Scalar*>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
- evalTo(this->m_result);
- return true;
- }
- }
- const Eigen::SyclDevice& device() const {return this->m_device;}
- void evalTo(Scalar* buffer) const {
- // Here is the result
- if (this->m_lhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, false, false, Unaligned>(buffer);
- }
- }
- }
- else {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, false, false, Unaligned>(buffer);
- }
- }
- }
- }
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalTyped(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
- EIGEN_UNUSED_VARIABLE(k)
- // rows in left side
- const Index m = this->m_i_size;
- // columns in right side
- const Index n = this->m_j_size;
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
- LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
- this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
- this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
- }
- // required by sycl to construct the expr on the device. Returns original left_impl
- const TensorEvaluator<LeftArgType, Device>& left_impl() const {
- return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_leftImpl, this->m_rightImpl);
- }
- // required by sycl to construct the expr on the device. Returns original right_impl
- const TensorEvaluator<RightArgType, Device>& right_impl() const {
- return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl);
- }
-};
-
-template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
-typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
-typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN,
-typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
- typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
- typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr;
- typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr;
- LHSFunctorExpr lhs_functors;
- RHSFunctorExpr rhs_functors;
- LhsLocalAcc localLhs;
- RhsLocalAcc localRhs;
- OutAccessor out_res;
- Index roundUpK, M, N, K;
- ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
- LeftNocontractT m_i_strides, m_left_nocontract_strides;
- RightNocontractT m_j_strides, m_right_nocontract_strides;
- LHSTupleType left_tuple_of_accessors;
- RHSTupleType right_tuple_of_accessors;
- Device dev;
-
-
- KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
- Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
- ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
- LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
- :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
- m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
- m_right_contracting_strides(m_right_contracting_strides_),
- m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
- m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_),
- left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr;
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr;
- auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors);
- auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors);
- typedef decltype(lhs_dev_expr.expr) LeftArgType;
- typedef decltype(rhs_dev_expr.expr) RightArgType;
- typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, LeftNocontractT,
- ContractT, 1,
- lhs_inner_dim_contiguous,
- false, Unaligned, MakeGlobalPointer> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, RightNocontractT,
- ContractT, 1,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
-  // Initializing the data mappers must happen inside the kernel for device-side evaluation
- LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
- lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
- RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
- rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
- auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
- // Matmul Kernel
- // Thread identifiers
- const Index mLocalThreadId = itemID.get_local(0); // Local ID row
- const Index nLocalThreadId = itemID.get_local(1); // Local ID col
- const Index mGroupId = itemID.get_group(0); // Work-group ID row
-    const Index nGroupId = itemID.get_group(1); // Work-group ID col
- const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
- // Allocate register space
- float privateLhs;
- float privateRhs[WorkLoadPerThreadN];
- float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
-    // Initialise the privateRes accumulation registers
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- privateRes[wLPTM][wLPTN] = 0.0f;
- }
- }
-
- // Tile Lhs
- for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
- Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localLhsRow = localLhsLinearId% TileSizeDimM;
- Index localLhsCol = localLhsLinearId/TileSizeDimM;
- // Load the value (wide vector load)
- Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
- localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0);
- }
- // Tile Rhs
- for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
- Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localRhsRow = localRhsLinearId% TileSizeDimN;
- Index localRhsCol = localRhsLinearId/TileSizeDimN;
- // Load the value (wide vector load)
- Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
- localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0);
-
- }
- // Loop over all tiles
- const Index numTiles = roundUpK/TileSizeDimK;
- Index firstHalf=0;
- do {
- // Synchronise
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- // Load the next tile of Lhs and Rhs into local memory
- Index nextHalf = firstHalf + 1;
- if (nextHalf < numTiles) {
- // Tile A
- for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
- Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localLhsRow = localLhsLinearId% TileSizeDimM;
- Index localLhsCol = localLhsLinearId/TileSizeDimM;
- // global K id
- Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
- // Store the loaded value into local memory
- localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0);
- }
- // Tile B
- for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
- Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localRhsRow = localRhsLinearId% TileSizeDimN;
- Index localRhsCol = localRhsLinearId/TileSizeDimN;
- // Load the value (wide vector load)
- Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
- // Store the loaded vector into local memory
- localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0);
- }
- }
- // Loop over the values of a single tile
- for (Index k=0; k<TileSizeDimK; k++) {
- // Cache the values of localRhs in registers
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
- privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
- }
- // Perform the computation
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
- privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
- }
- }
- }
- // Next tile
- firstHalf++;
- } while (firstHalf<numTiles);
-
- // Store the final results in C
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
- if (globalRow< M){
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
- if(globalCol<N)
- out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
- }
- }
- }
-
- }
-
-};
-template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
-
-static const Index TileSizeDimM = 32ul; // Tile size for dimension M
-static const Index TileSizeDimN = 32ul; // Tile size for dimension N
-static const Index TileSizeDimK = 16ul; // Tile size for dimension K
-static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M
-static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N
-static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
-static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
-static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
-static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression
-
-// RoundUp function to make sure that the global thread count is divisible by the local thread count
-static Index RoundUp(Index x, Index y) {
- return ((((x) + (y) - 1) / (y))*(y));
-}
-
-template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
- static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
- ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
- LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
-
- typedef typename Self::XprType HostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
- typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr;
- typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr;
- typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr;
- typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> RHSFunctorExpr;
- // extract lhs functor list
- LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
-  // extract rhs functor list
-  RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl());
-
- Index roundUpK = RoundUp(K, TileSizeDimK);
- Index roundUpM = RoundUp(M, TileSizeDimM);
- Index roundUpN = RoundUp(N, TileSizeDimN);
-
- self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
- /// work-around for gcc bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
- /// work-around for gcc bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType;
- // create lhs tuple of accessors
- LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl());
- // create rhs tuple of accessors
- RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl());
-
- // Local memory for elements of Lhs
- typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc;
- LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
- // Local memory for elements of Rhs
- typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
- RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
-
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
- //OutScalar memory
- OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
-
- // sycl parallel for
- cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
- cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
- KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
- RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
- WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
- localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
- m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
- });
- self.device().asynchronousExec();
- }
-};
-
-} // end namespace Eigen
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
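The kernel deleted above is a standard work-group-tiled matrix multiply: the problem sizes are rounded up to tile multiples, loads into local memory are zero-padded when they fall outside the matrices, tiles along K are double-buffered, and each thread accumulates a small register block of results. The following is only a minimal, single-threaded C++ sketch of that blocking scheme, not the Eigen/SYCL code: it assumes column-major storage and a pre-zeroed output C (as evalTyped guarantees via memset), and tiled_gemm and round_up are illustrative names rather than Eigen API.

#include <algorithm>
#include <cstddef>
#include <vector>

// Round x up to the next multiple of y, mirroring the RoundUp helper above.
static std::size_t round_up(std::size_t x, std::size_t y) { return ((x + y - 1) / y) * y; }

// Tiled GEMM: C(MxN) += A(MxK) * B(KxN), all column-major.
// Out-of-range K tiles contribute nothing, like the guarded loads in the kernel.
void tiled_gemm(const std::vector<float>& A, const std::vector<float>& B,
                std::vector<float>& C, std::size_t M, std::size_t N, std::size_t K) {
  const std::size_t TM = 32, TN = 32, TK = 16;         // tile sizes for M, N, K
  const std::size_t numKTiles = round_up(K, TK) / TK;  // K padded to a tile multiple
  for (std::size_t m0 = 0; m0 < M; m0 += TM)
    for (std::size_t n0 = 0; n0 < N; n0 += TN)
      for (std::size_t t = 0; t < numKTiles; ++t)      // loop over K tiles
        for (std::size_t m = m0; m < std::min(m0 + TM, M); ++m)
          for (std::size_t n = n0; n < std::min(n0 + TN, N); ++n) {
            float acc = 0.0f;
            for (std::size_t k = t * TK; k < std::min((t + 1) * TK, K); ++k)
              acc += A[k * M + m] * B[n * K + k];      // column-major indexing
            C[n * M + m] += acc;                       // C assumed zero-initialised
          }
}

The double buffering, local accessors, and work-group mapping of the SYCL version are omitted here; only the tiling arithmetic is kept so the index expressions in the deleted kernel are easier to follow.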
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index d30cc96..ee16cde 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -116,28 +116,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
- const Index m = this->m_i_size;
- const Index n = this->m_j_size;
- const Index k = this->m_k_size;
- if (m == 0 || n == 0 || k == 0) return;
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- if (this->m_can_use_xsmm) {
- bool transposeA = !this->m_lhs_inner_dim_contiguous;
- bool transposeB = !this->m_rhs_inner_dim_contiguous;
- internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index>
- blocking(k, m, n, this->m_device.numThreads(), transposeA,
- transposeB);
-
- if (blocking.num_threads() == 1) {
- this->evalGemmXSMM(buffer);
- } else {
- ContextXsmm<Alignment>(this, buffer, m, n, k, blocking).run();
- }
- return;
- }
-#endif
-
typedef
typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
LhsScalar;
@@ -169,7 +147,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
Traits::mr, Traits::nr, false, false>
GebpKernel;
-
+ const Index m = this->m_i_size;
+ const Index n = this->m_j_size;
+ const Index k = this->m_k_size;
+ if (m == 0 || n == 0 || k == 0) return;
// Compute a set of algorithm parameters:
// - kernel block sizes (bm, bn, bk)
@@ -1063,187 +1044,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
rhsCost.dropMemoryCost();
return cost + lhsCost + rhsCost;
}
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- template<int Alignment>
- class ContextXsmm {
- public:
- ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k,
- const internal::TensorXsmmContractionBlocking<LhsScalar,
- RhsScalar, Index>& blocking):
- device(self->m_device),
- m(m), k(k), n(n),
- stride_a(blocking.transposeA() ? k : m),
- stride_b(blocking.transposeB() ? n : k),
- stride_c(m),
- bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()),
- blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()),
- blocks_n(blocking.blocks_n()),
- copyA(blocking.copyA()), copyB(blocking.copyB()),
- transposeA(blocking.transposeA()), transposeB(blocking.transposeB()),
- num_threads(blocking.num_threads()),
- buffer(buffer),
- leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()),
- workers_done(blocking.num_threads()),
-
- packingA_jobs(0), packingB_jobs(0), compute_jobs(0),
- packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()) {}
-
- void worker() {
- // Pack
-
- if (copyA) {
- while (true) {
- uint32_t mk = packingA_jobs++;
- Index mi = mk / blocks_k;
- Index ki = mk % blocks_k;
- if (mi >= blocks_m) break;
-
- LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki);
- if (transposeA) {
- const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki);
- libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki),
- actual_bm(mi), stride_a, bm);
- } else {
- const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi);
- internal::pack_simple<LhsScalar, Index>(blockA, current_a,
- actual_bk(ki), actual_bm(mi), bm, stride_a);
- }
- packingA_done.at(mi)++;
- }
- }
-
- if (copyB) {
- while (true) {
- uint32_t nk = packingB_jobs++;
- Index ni = nk / blocks_k;
- Index ki = nk % blocks_k;
- if (ni >= blocks_n) break;
-
- RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki);
- if (transposeB) {
- const RhsScalar * current_b = rightData + (ki*bk)*stride_b +
- (ni*bn);
- libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni),
- actual_bk(ki), stride_b, bk);
- } else {
- const RhsScalar * current_b = rightData + (ni*bn)*stride_b +
- (ki*bk);
- internal::pack_simple<RhsScalar, Index>(blockB, current_b,
- actual_bn(ni), actual_bk(ki), bk, stride_b);
- }
- packingB_done.at(ni)++;
- }
- }
-
- // Compute
-
- while (true) {
- uint32_t mn = compute_jobs++;
- Index mi = mn / blocks_n;
- Index ni = mn % blocks_n;
- if (mi >= blocks_m) break;
-
- // Wait for mi, ni packings to be done. This is more fine-grained than
- // waiting for all workers to finish packing.
- while ((copyA && (packingA_done.at(mi) < blocks_k)) ||
- (copyB && (packingB_done.at(ni) < blocks_k)))
- {}
-
- for (Index ki=0; ki < blocks_k; ++ki) {
- const LhsScalar * current_a = copyA ?
- blocksA + (bk*bm) * (mi*blocks_k+ki) :
- leftData + (bk*ki)*stride_a + (bm*mi);
- const RhsScalar * current_b = copyB ?
- blocksB + (bk*bn) * (ni*blocks_k+ki) :
- rightData + (ni*bn)*stride_b + (bk*ki);
-
- Index current_stride_a = copyA ? bm : stride_a;
- Index current_stride_b = copyB ? bk : stride_b;
-
- // Memory may not be zeroed, overwrite instead of adding in first
- // iteration.
- float beta = ki == 0 ? 0 : 1;
-
- Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c;
- internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(
- 0, actual_bm(mi), actual_bn(ni), actual_bk(ki),
- current_stride_a, current_stride_b, stride_c, 1, beta, 0)
- (current_a, current_b, current_c);
- }
- }
-
- workers_done.Notify();
- }
-
- void run() {
- // Parallelization strategy.
- //
- // First pack A into blocks (sharding by m, k) and B (sharding by n,k),
- // then shard by m, n.
- //
- // Do not use advanced ThreadPool queuing, just run a single long-standing
- // function in each thread.
- if (copyA) {
- blocksA = static_cast<LhsScalar*>(device.allocate(
- (blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar)));
- }
- if (copyB) {
- blocksB = static_cast<RhsScalar*>(device.allocate(
- (blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar)));
- }
-
- for (Index i = 0; i < num_threads; ++i) {
- device.enqueueNoNotification([=]() { worker(); });
- }
-
- workers_done.Wait();
-
- if (copyA) {
- device.deallocate(blocksA);
- }
- if (copyB) {
- device.deallocate(blocksB);
- }
- }
-
- private:
- // real block size for block index in [0, ..., blocks - 1].
- Index actual_bm(Index mi) const {
- return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m;
- }
- Index actual_bk(Index ki) const {
- return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k;
- }
- Index actual_bn(Index ni) const {
- return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n;
- }
-
- const Device& device;
- Index m, k, n;
- Index stride_a, stride_b, stride_c;
- Index bm, bk, bn; // Block sizes.
- Index blocks_m, blocks_k, blocks_n; // Number of blocks in each dimension.
- bool copyA, copyB, transposeA, transposeB;
- Index num_threads;
- Scalar *buffer;
- const LhsScalar *leftData;
- const RhsScalar *rightData;
-
- LhsScalar *blocksA;
- RhsScalar *blocksB;
- // barrier for joining all threads after all done.
- Barrier workers_done;
- // "queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q)
- std::atomic<uint32_t> packingA_jobs;
- std::atomic<uint32_t> packingB_jobs;
- std::atomic<uint32_t> compute_jobs;
- // already packed blocks for each mi-panel in A and ni-panel in B.
- std::vector<std::atomic<uint8_t>> packingA_done;
- std::vector<std::atomic<uint8_t>> packingB_done;
- };
-#endif
-
};
} // end namespace Eigen
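The removed ContextXsmm helper distributed its packing and compute work by treating plain atomic counters as flat job queues: each worker fetch-adds a counter, decodes the flattened (i, j) pair, and exits once the range is exhausted, with per-panel "done" counters providing fine-grained waiting between packing and compute. Below is only a minimal sketch of that claim-by-counter pattern; run_jobs and next_job are illustrative names, not part of the libxsmm code path.

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <thread>
#include <vector>

// Each worker repeatedly claims the next flattened (i, j) job with a
// fetch-and-add on a shared counter, the "queue" idea used above.
void run_jobs(std::size_t rows, std::size_t cols, unsigned num_threads) {
  std::atomic<std::uint32_t> next_job{0};
  auto worker = [&]() {
    while (true) {
      std::uint32_t job = next_job++;   // claim one (i, j) pair
      std::size_t i = job / cols;
      std::size_t j = job % cols;
      if (i >= rows) break;             // all jobs handed out
      // ... process block (i, j) here ...
      (void)j;
    }
  };
  std::vector<std::thread> pool;
  for (unsigned t = 0; t < num_threads; ++t) pool.emplace_back(worker);
  for (auto& th : pool) th.join();
}

The deleted class layered three such counters (packing A, packing B, compute) and spun on per-panel progress counters before consuming packed blocks.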
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index b29968b..860a694 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -246,9 +246,6 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the sycl accessor
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
protected:
template <int LoadMode, bool ActuallyVectorize>
struct PacketConv {
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 378f5cc..abdf742 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -100,7 +100,7 @@ class IndexMapper {
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
- if (static_cast<size_t>(i + 1) < offset) {
+ if (i + 1 < offset) {
m_cudaInputStrides[i] =
m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
m_cudaOutputStrides[i] =
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
deleted file mode 100644
index 4247c1c..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ /dev/null
@@ -1,476 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
-
-namespace Eigen {
-
-/** \class TensorConvolution
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor convolution class.
- *
- *
- */
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel1D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize, range_x, range_y;
-Buffer_accessor buffer_acc;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
-
- void operator()(cl::sycl::nd_item<2> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-
-    const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); // number of input elements needed per plane in shared memory
- const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input;
- const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1));
- /// fill the shared memory
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + plane_kernel_offset ;
- const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start);
- if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- // calculate the convolution
- const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
- const size_t index = plane_kernel_offset+ itemID.get_local(0);
- for (size_t k = 0; k < kernelSize; ++k) {
- result += (local_acc[k + index] * kernel_ptr[k]);
- }
- const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
- +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
- buffer_ptr[tensor_index] = result;
- }
- }
-};
-
-
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel2D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
-Buffer_accessor buffer_acc;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
-
- void operator()(cl::sycl::nd_item<3> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-    const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); // number of x input elements needed per plane in shared memory
-    const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); // number of y input elements needed per plane in shared memory
- const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2));
- const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input;
-
- /// fill the shared memory
- const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
- for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
- const size_t local_input_offset = num_x_input * (j + plane_kernel_offset);
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + local_input_offset;
- const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start );
- if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
- }
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- // calculate the convolution
- const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
- const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
- for (size_t j = 0; j < kernelSize_y; j++) {
- size_t kernel_offset =kernelSize_x * j;
- const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0);
- for (size_t i = 0; i < kernelSize_x; i++) {
- result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]);
- }
- }
- const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
- +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start);
- buffer_ptr[tensor_index] = result;
- }
- }
-};
-
-
-
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel3D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
-Buffer_accessor buffer_acc;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
- const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
- kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
-
- void operator()(cl::sycl::nd_item<3> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-    const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); // number of x input elements needed per plane in shared memory
-    const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); // number of y input elements needed per plane in shared memory
-    const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); // number of z input elements needed per plane in shared memory
- const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
- const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2];
- for(size_t p=0; p<numP; p++){
- /// fill the shared memory
- const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
- for (size_t k = itemID.get_local(2); k < num_z_input; k += itemID.get_local_range()[2]) {
- for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + (num_x_input * (j + (num_y_input * k)));
- const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start , k+ first_z_input_start );
- if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) && ((j + first_y_input_start) < (range_y +kernelSize_y-1)) && ((k + first_z_input_start) < (range_z +kernelSize_z-1)) ){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
- }
- }
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- // calculate the convolution
- const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // x
- const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // y
- const size_t fitst_z_output_start =itemID.get_group(2)*(itemID.get_local_range()[2]); // z
-
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
- for (size_t k = 0; k < kernelSize_z; k++) {
- for (size_t j = 0; j < kernelSize_y; j++) {
- for (size_t i = 0; i < kernelSize_x; i++) {
- const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k);
- const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2))));
- result += (local_acc[local_index] * kernel_ptr[kernel_index]);
- }
- }
- }
- const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
- +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start );
- buffer_ptr[tensor_index] = result;
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- }
- }
-};
-
-
-template<typename Indices, typename InputArgType, typename KernelArgType>
-struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice>
-{
- typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
-
- static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions>::value;
- static const int NumKernelDims = internal::array_size<Indices>::value;
- typedef typename XprType::Index Index;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions KernelDimensions;
- typedef const Eigen::SyclDevice Device;
-
- enum {
- IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
- PacketAccess = false,
- Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
- : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- const typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
- const typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
-
- m_dimensions = m_inputImpl.dimensions();
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = op.indices()[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- m_dimensions[index] = result_dim;
- }
- }
-
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, const Eigen::SyclDevice>::type PacketReturnType;
- typedef typename InputArgType::Scalar Scalar;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
- preloadKernel();
- m_inputImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- executeEval(data);
- return false;
- } else {
- m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
- executeEval(m_buf);
- return true;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
- m_inputImpl.cleanup();
- if (m_buf) {
- m_device.deallocate(m_buf);
- m_buf = NULL;
- }
- if (m_local_kernel) {
- m_device.deallocate((void*)m_kernel);
- m_local_kernel = false;
- }
- m_kernel = NULL;
- }
- /// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
- /// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
- // Don't make a local copy of the kernel unless we have to (i.e. it's an
- // expression that needs to be evaluated)
- const Scalar* in_place = m_kernelImpl.data();
- if (in_place) {
- m_kernel = in_place;
- m_local_kernel = false;
- } else {
- size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
- Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
- typedef TensorEvalToOp<const KernelArgType> EvalTo;
- EvalTo evalToTmp(local, m_kernelArg);
- const bool PacketAccess = internal::IsVectorizable<const Eigen::SyclDevice, KernelArgType>::value;
- internal::TensorExecutor<const EvalTo, const Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
- m_kernel = local;
- m_local_kernel = true;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const {
- typedef TensorEvaluator<InputArgType, const Eigen::SyclDevice> InputEvaluator;
- typedef typename InputEvaluator::Dimensions InputDims;
-
- typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
- // extract input functor list
- InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
-
-
- m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
-
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> InputLocalAcc;
- /// work-around for gcc 4.8 auto bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl)) InputTupleType;
- // create input tuple of accessors
- InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
-
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType;
- OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data);
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
- KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);
-
- switch (NumKernelDims) {
- case 1: {
- const size_t numX = dimensions()[m_indices[0]];
- const size_t numP = dimensions().TotalSize() / numX;
- const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y;
- m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y );
- const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y);
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range
- auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- const array<Index, 1> indices{{m_indices[0]}};
- const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
- internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
- EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors));
- break;
- }
-
- case 2: {
- const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
- const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
- const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
- const size_t numX = dimensions()[m_indices[idxX]];
- const size_t numY = dimensions()[m_indices[idxY]];
- const size_t numP = dimensions().TotalSize() / (numX*numY);
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
- m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
- const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z;
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
- auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- const array<Index, 2> indices {{m_indices[idxX], m_indices[idxY]}};
- const array<Index, 2> kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}};
- internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
- EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors));
- break;
- }
-
- case 3: {
- const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
- const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
- const size_t idxZ =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
- const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
- const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ];
- const size_t numX = dimensions()[m_indices[idxX]];
- const size_t numY = dimensions()[m_indices[idxY]];
- const size_t numZ = dimensions()[m_indices[idxZ]];
- const size_t numP = dimensions().TotalSize() / (numX*numY*numZ);
- const array<Index, 3> indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}};
- const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}};
- internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
- m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
-            const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_z -1);
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
- auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
- EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
- numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors));
- break;
- }
-
- default: {
- EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
- }
- }
- });
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- eigen_assert(m_buf);
- eigen_assert(index < m_dimensions.TotalSize());
- return m_buf[index];
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
- {
- eigen_assert(m_buf);
- eigen_assert(index < m_dimensions.TotalSize());
- return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
- // model.
- const double kernel_size = m_kernelImpl.dimensions().TotalSize();
- // We ignore the use of fused multiply-add.
- const double convolve_compute_cost =
- TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
- const double firstIndex_compute_cost =
- NumDims *
- (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>());
- return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
- kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
- m_kernelImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, convolve_compute_cost, vectorized,
- PacketSize));
- }
-
- private:
- // No assignment (copies are needed by the kernels)
- TensorEvaluator& operator = (const TensorEvaluator&);
- TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl;
- KernelArgType m_kernelArg;
- TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl;
- Indices m_indices;
- Dimensions m_dimensions;
- Scalar* m_buf;
- const Scalar* m_kernel;
- bool m_local_kernel;
- const Eigen::SyclDevice& m_device;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
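All three convolution kernels deleted above follow the same staging pattern: each work-group copies its output tile plus a kernel-sized halo into local memory, zero-padding reads that fall outside the input, synchronises with a barrier, and then lets every work-item accumulate one output coefficient from the staged data. A minimal 1D, host-side C++ sketch of that tile-plus-halo idea follows; conv1d_tiled is an illustrative name and the scratch vector stands in for the SYCL local accessor, so this is not the Eigen implementation itself.

#include <algorithm>
#include <cstddef>
#include <vector>

// "Valid" 1D convolution, tile by tile: stage tileSize + kernelSize - 1 inputs
// (the tile plus its halo, zero-padded past the end) into a scratch buffer,
// then compute one output per position from the staged data.
std::vector<float> conv1d_tiled(const std::vector<float>& in,
                                const std::vector<float>& kernel,
                                std::size_t tileSize) {
  const std::size_t kSize = kernel.size();
  const std::size_t outSize = in.size() >= kSize ? in.size() - kSize + 1 : 0;
  std::vector<float> out(outSize, 0.0f);
  std::vector<float> tile(tileSize + kSize - 1);
  for (std::size_t start = 0; start < outSize; start += tileSize) {
    for (std::size_t i = 0; i < tile.size(); ++i) {   // stage tile + halo
      const std::size_t src = start + i;
      tile[i] = src < in.size() ? in[src] : 0.0f;     // zero-pad the halo
    }
    const std::size_t end = std::min(start + tileSize, outSize);
    for (std::size_t x = start; x < end; ++x) {       // one output per work-item
      float acc = 0.0f;
      for (std::size_t k = 0; k < kSize; ++k) acc += tile[x - start + k] * kernel[k];
      out[x] = acc;
    }
  }
  return out;
}

The 2D and 3D kernels extend the same staging to more dimensions, with the IndexMapper translating between tensor coordinates and the flattened buffer offsets.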
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index be8d693..4f5767b 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -88,7 +88,7 @@ static void initializeDeviceProp() {
#if __cplusplus >= 201103L
std::atomic_thread_fence(std::memory_order_acquire);
#endif
- EIGEN_SLEEP(1000);
+ sleep(1);
}
}
}
@@ -217,10 +217,7 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
- EIGEN_UNUSED_VARIABLE(dst);
- EIGEN_UNUSED_VARIABLE(src);
- EIGEN_UNUSED_VARIABLE(n);
- eigen_assert(false && "The default device should be used instead to generate kernel code");
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index ccaaa6c..9d14139 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -45,7 +45,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#ifndef __CUDA_ARCH__
// Running on the host CPU
return l1CacheSize();
#else
@@ -55,7 +55,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#ifndef __CUDA_ARCH__
// Running single threaded on the host CPU
return l3CacheSize();
#else
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index e209799..7c03989 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -16,400 +16,107 @@
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
namespace Eigen {
-
- #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
-
- template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
- public:
- MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
- auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc);
- auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc);
- auto globalid = itemID.get_global_linear_id();
- if (globalid < m_rng) {
- dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
- }
- }
-
- private:
- read_accessor m_src_acc;
- write_accessor m_dst_acc;
- size_t m_rng;
- size_t m_i;
- size_t m_offset;
- };
-
- struct memsetkernelFunctor{
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
- AccType m_acc;
- const size_t m_rng, m_c;
- memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- auto globalid=itemID.get_global_linear_id();
- if (globalid< m_rng) m_acc[globalid] = m_c;
- }
-
- };
-
-EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
- auto devices = cl::sycl::device::get_devices();
- std::vector<cl::sycl::device>::iterator it =devices.begin();
- while(it!=devices.end()) {
- /// get_devices returns all the available opencl devices. Either use device_selector or exclude devices that computecpp does not support (AMD OpenCL for CPU )
- auto s= (*it).template get_info<cl::sycl::info::device::vendor>();
- std::transform(s.begin(), s.end(), s.begin(), ::tolower);
- if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // remove amd cpu as it is not supported by computecpp allow APUs
- it=devices.erase(it);
- }
- else{
- ++it;
- }
- }
- return devices;
-}
-
-struct QueueInterface {
- /// class members:
- bool exception_caught_ = false;
-
- mutable std::mutex mutex_;
-
+struct SyclDevice {
+ /// class members
+ /// sycl queue
+ mutable cl::sycl::queue m_queue;
/// std::map is the container used to make sure that we create only one buffer
/// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
/// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it.
- mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
- /// sycl queue
- mutable cl::sycl::queue m_queue;
- /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename
- /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it.
- template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s):
+ mutable std::map<const void *, std::shared_ptr<void>> buffer_map;
+ /// creating device by using selector
+ template<typename dev_Selector> SyclDevice(dev_Selector s)
+ :
#ifdef EIGEN_EXCEPTIONS
- m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
+ m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) {
for (const auto& e : l) {
try {
- if (e) {
- exception_caught_ = true;
- std::rethrow_exception(e);
- }
+ std::rethrow_exception(e);
} catch (cl::sycl::exception e) {
- std::cerr << e.what() << std::endl;
- }
+ std::cout << e.what() << std::endl;
+ }
}
}))
#else
-m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
- for (const auto& e : l) {
- if (e) {
- exception_caught_ = true;
- std::cerr << "Error detected Inside Sycl Device."<< std::endl;
-
- }
- }
-}))
+ m_queue(cl::sycl::queue(s))
#endif
{}
+ // destructor
+ ~SyclDevice() { deallocate_all(); }
- /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer.
- /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key
- /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we
- /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer.
- /// The device pointer would be deleted by calling deallocate function.
- EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
- auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes));
- auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer();
- buf.set_final_data(nullptr);
- std::lock_guard<std::mutex> lock(mutex_);
- buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf));
- return static_cast<void*>(ptr);
- }
-
- /// This is used to deallocate the device pointer. p is used as a key inside
- /// the map to find the device buffer and delete it.
- EIGEN_STRONG_INLINE void deallocate(void *p) const {
- std::lock_guard<std::mutex> lock(mutex_);
- auto it = buffer_map.find(static_cast<const uint8_t*>(p));
+ template <typename T> void deallocate(T *p) const {
+ auto it = buffer_map.find(p);
if (it != buffer_map.end()) {
buffer_map.erase(it);
+ internal::aligned_free(p);
}
}
-
- EIGEN_STRONG_INLINE void deallocate_all() const {
- std::lock_guard<std::mutex> lock(mutex_);
- buffer_map.clear();
- }
-
- EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator find_buffer(const void* ptr) const {
- std::lock_guard<std::mutex> lock(mutex_);
- auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr));
- if (it1 != buffer_map.end()){
- return it1;
- }
- else{
- for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){
- auto size = it->second.get_size();
- if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it;
- }
- }
- std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl;
- abort();
- }
-
- // This function checks if the runtime recorded an error for the
- // underlying stream device.
- EIGEN_STRONG_INLINE bool ok() const {
- if (!exception_caught_) {
- m_queue.wait_and_throw();
+ void deallocate_all() const {
+ std::map<const void *, std::shared_ptr<void>>::iterator it=buffer_map.begin();
+ while (it!=buffer_map.end()) {
+ auto p=it->first;
+ buffer_map.erase(it);
+ internal::aligned_free(const_cast<void*>(p));
+ it=buffer_map.begin();
}
- return !exception_caught_;
+ buffer_map.clear();
}
- // destructor
- ~QueueInterface() { buffer_map.clear(); }
-};
-
-struct SyclDevice {
- // class member.
- QueueInterface* m_queue_stream;
- /// QueueInterface is not owned. it is the caller's responsibility to destroy it.
- explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){}
-
- /// Creation of sycl accessor for a buffer. This function first tries to find
+ /// creation of sycl accessor for a buffer. This function first tries to find
/// the buffer in the buffer_map. If found it gets the accessor from it, if not,
- /// the function then adds an entry by creating a sycl buffer for that particular pointer.
- template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer>
- get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const {
- return (get_sycl_buffer(ptr).template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
+ ///the function then adds an entry by creating a sycl buffer for that particular pointer.
+ template <cl::sycl::access::mode AcMd, typename T> inline cl::sycl::accessor<T, 1, AcMd, cl::sycl::access::target::global_buffer>
+ get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const {
+ return (get_sycl_buffer<T>(num_bytes, ptr)->template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
}
- /// Accessing the created sycl device buffer for the device pointer
- EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const {
- return m_queue_stream->find_buffer(ptr)->second;
+ template<typename T> inline std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const {
+ using Type = cl::sycl::buffer<T, 1>;
+ std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> ret = buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(ptr, std::shared_ptr<void>(new Type(cl::sycl::range<1>(num_bytes)),
+ [](void *dataMem) { delete static_cast<Type*>(dataMem); })));
+ (static_cast<Type*>(buffer_map.at(ptr).get()))->set_final_data(nullptr);
+ return ret;
}
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
- tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
- auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
- std::transform(s.begin(), s.end(), s.begin(), ::tolower);
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
- }
- rng = n;
- if (rng==0) rng=static_cast<Index>(1);
- GRange=rng;
- if (tileSize>GRange) tileSize=GRange;
- else if(GRange>tileSize){
- Index xMode = static_cast<Index>(GRange % tileSize);
- if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
- }
- }
-
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
- Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
- }
- Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
- rng1=dim1;
- if (rng1==0 ) rng1=static_cast<Index>(1);
- GRange1=rng1;
- if (tileSize1>GRange1) tileSize1=GRange1;
- else if(GRange1>tileSize1){
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
- }
- tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
- rng0 = dim0;
- if (rng0==0 ) rng0=static_cast<Index>(1);
- GRange0=rng0;
- if (tileSize0>GRange0) tileSize0=GRange0;
- else if(GRange0>tileSize0){
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
- }
+ template <typename T> inline cl::sycl::buffer<T, 1>* get_sycl_buffer(size_t num_bytes,const T * ptr) const {
+ return static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(ptr, num_bytes).first->second.get());
}
-
-
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
- Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
- }
- Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
- rng2=dim2;
- if (rng2==0 ) rng1=static_cast<Index>(1);
- GRange2=rng2;
- if (tileSize2>GRange2) tileSize2=GRange2;
- else if(GRange2>tileSize2){
- Index xMode = static_cast<Index>(GRange2 % tileSize2);
- if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
- }
- pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
- tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
- rng1=dim1;
- if (rng1==0 ) rng1=static_cast<Index>(1);
- GRange1=rng1;
- if (tileSize1>GRange1) tileSize1=GRange1;
- else if(GRange1>tileSize1){
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
- }
- tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
- rng0 = dim0;
- if (rng0==0 ) rng0=static_cast<Index>(1);
- GRange0=rng0;
- if (tileSize0>GRange0) tileSize0=GRange0;
- else if(GRange0>tileSize0){
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
- }
- }
- /// allocate device memory
- EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
- return m_queue_stream->allocate(num_bytes);
+ /// allocating memory on the cpu
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const {
+ return internal::aligned_malloc(8);
}
- /// deallocate device memory
- EIGEN_STRONG_INLINE void deallocate(void *p) const {
- m_queue_stream->deallocate(p);
- }
// some runtime conditions that can be applied here
- EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+ bool isDeviceSuitable() const { return true; }
- /// the memcpy function
- template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
- auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(src));
- auto it2 = m_queue_stream->find_buffer(dst);
- auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
- auto i= (static_cast<const uint8_t*>(dst)) - it2->first;
- offset/=sizeof(Index);
- i/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- sycl_queue().submit([&](cl::sycl::handler &cgh) {
- auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset));
- });
- synchronize();
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+ ::memcpy(dst, src, n);
}
- /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
- /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode
- /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
- /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
- /// this buffer is accessed, the data will be copied to the device.
- template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
- auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
- ::memcpy(host_acc.get_pointer(), src, n);
+ template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const {
+ auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
+ memcpy(host_acc.get_pointer(), src, n);
}
- /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl
- /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
- /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination
- /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data
- /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
- /// to the cpu only once per function call.
- template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
- auto it = m_queue_stream->find_buffer(src);
- auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
- offset/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- // Assuming that the dst is the start of the destination pointer
- auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
- sycl_queue().submit([&](cl::sycl::handler &cgh) {
- auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
- });
- synchronize();
- }
- /// returning the sycl queue
- EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
- /// Here is the implementation of memset function on sycl.
- EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
- size_t rng, GRange, tileSize;
- parallel_for_setup(n, tileSize, rng, GRange);
- sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
- synchronize();
- }
-
- struct memsetCghFunctor{
- cl::sycl::buffer<uint8_t, 1>& m_buf;
- const size_t& rng , GRange, tileSize;
- const int &c;
- memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
- :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
-
- void operator()(cl::sycl::handler &cgh) const {
- auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c));
+  /// With the current implementation of SYCL, the data is copied twice from device to host. This will be fixed soon.
+ template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const {
+ auto it = buffer_map.find(src);
+ if (it != buffer_map.end()) {
+ auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(it->second.get()))-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>();
+ memcpy(dst,host_acc.get_pointer(), n);
+ } else{
+ eigen_assert("no device memory found. The memory might be destroyed before creation");
}
- };
-
- EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
- // FIXME
- return 48*1024;
- }
-
- EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
- // We won't try to take advantage of the l2 cache for the time being, and
- // there is no l3 cache on cuda devices.
- return firstLevelCacheSize();
- }
- EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
- // return stream_->deviceProperties().multiProcessorCount;
}
- EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
- // return stream_->deviceProperties().maxThreadsPerBlock;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const {
+ ::memset(buffer, c, n);
}
- EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
- // OpenCL doesnot have such concept
- return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
- // return stream_->deviceProperties().maxThreadsPerMultiProcessor;
- }
- EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
- // return stream_->deviceProperties().sharedMemPerBlock;
- }
- /// No need for sycl it should act the same as CPU version
- EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
-
- EIGEN_STRONG_INLINE void synchronize() const {
- sycl_queue().wait_and_throw(); //pass
- }
-
- EIGEN_STRONG_INLINE void asynchronousExec() const {
- ///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
- //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
- sycl_queue().wait_and_throw(); //pass
-
- }
- // This function checks if the runtime recorded an error for the
- // underlying stream device.
- EIGEN_STRONG_INLINE bool ok() const {
- return m_queue_stream->ok();
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+ return 1;
}
};
-
-
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
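
The removed QueueInterface/SyclDevice above sizes every kernel launch through parallel_for_setup(): the tile (work-group) size is capped by the device limit, and the global range is rounded up to a multiple of the tile size so the nd_range divides evenly, with kernels guarding on (globalid < rng). A standalone sketch of the 1-D case, assuming the work-group cap is passed in rather than queried from SYCL:

    #include <cstddef>

    // Mirrors the 1-D parallel_for_setup() logic removed above; names are local
    // to this sketch and no SYCL headers are required.
    void parallelForSetup1D(std::size_t n, std::size_t max_work_group_size,
                            std::size_t& tile, std::size_t& range, std::size_t& global) {
      tile = max_work_group_size;
      range = (n == 0) ? 1 : n;            // never launch an empty range
      global = range;
      if (tile > global) {
        tile = global;                     // small problems: one undersized group
      } else if (global % tile != 0) {
        global += tile - (global % tile);  // round up; kernels guard with (id < range)
      }
    }
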
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 16180ca..069680a 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -12,6 +12,17 @@
namespace Eigen {
+// Use the NonBlockingThreadPool by default; define EIGEN_USE_SIMPLE_THREAD_POOL
+// to fall back to the SimpleThreadPool instead.
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
+typedef NonBlockingThreadPool ThreadPool;
+#else
+template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
+typedef SimpleThreadPool ThreadPool;
+#endif
+
+
// Barrier is an object that allows one or more threads to wait until
// Notify has been called a specified number of times.
class Barrier {
@@ -245,7 +256,7 @@ struct ThreadPoolDevice {
// Split into halves and submit to the pool.
Index mid = first + divup((last - first) / 2, block_size) * block_size;
pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
- handleRange(first, mid);
+ pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
};
handleRange(0, n);
barrier.Wait();
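
The parallelFor hunk above changes the recursive splitter so that both halves are handed to the pool, instead of the caller descending into the first half itself. A rough sketch of the same divide-and-conquer shape, using std::async in place of Eigen's pool and Barrier; the block-aligned midpoint mirrors the code above, everything else is an assumption.

    #include <functional>
    #include <future>

    void parallelForSketch(long first, long last, long block_size,
                           const std::function<void(long, long)>& f) {
      if (last - first <= block_size) {
        f(first, last);  // small enough: run the block inline
        return;
      }
      // Split at a block-size-aligned midpoint, as in the code above.
      const long half = (last - first) / 2;
      const long mid = first + ((half + block_size - 1) / block_size) * block_size;
      auto right = std::async(std::launch::async, parallelForSketch, mid, last, block_size, std::cref(f));
      auto left  = std::async(std::launch::async, parallelForSketch, first, mid, block_size, std::cref(f));
      left.get();
      right.get();  // joining the futures plays the role of Eigen's Barrier
    }
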
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 86405e6..b24cdeb 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -33,7 +33,7 @@ namespace Eigen {
namespace internal {
template<std::size_t n, typename Dimension> struct dget {
- static const std::ptrdiff_t value = get<n, Dimension>::value;
+ static const std::size_t value = get<n, Dimension>::value;
};
@@ -90,11 +90,9 @@ struct fixed_size_tensor_index_extraction_helper<Index, 0>
// Fixed size
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::ptrdiff_t... Indices>
-struct Sizes {
+struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
- const Base t = Base();
static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
- static const size_t count = Base::count;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
return Base::count;
@@ -122,16 +120,16 @@ struct Sizes {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const {
- return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
+ return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, *this);
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t);
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this));
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t);
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this));
}
};
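
With the change above, Sizes<...> derives from numeric_list and the index helpers linearize directly over the base class. For reference, the column-major linearization computed by IndexOfColMajor() reduces to "first dimension varies fastest"; a plain-array sketch, not using Eigen types:

    #include <array>
    #include <cstddef>

    // Column-major linear index: index = i0 + d0*(i1 + d1*(i2 + ...)).
    template <std::size_t Rank>
    std::size_t indexOfColMajor(const std::array<std::size_t, Rank>& dims,
                                const std::array<std::size_t, Rank>& idx) {
      std::size_t linear = 0;
      std::size_t stride = 1;
      for (std::size_t d = 0; d < Rank; ++d) {   // first dimension varies fastest
        linear += idx[d] * stride;
        stride *= dims[d];
      }
      return linear;
    }
    // Example: indexOfColMajor<3>({2, 3, 4}, {1, 2, 3}) == 1 + 2*2 + 3*6 == 23
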
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 82dd1e6..0698713 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -41,9 +41,6 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
- typedef typename MakePointerT::RefType RefType;
-
-
};
};
@@ -120,7 +117,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const {
return m_op;
}
-
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
}
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index d641581..834ce07 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -32,7 +32,6 @@ struct TensorEvaluator
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
- typedef Derived XprType;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
@@ -69,9 +68,7 @@ struct TensorEvaluator
return m_data[index];
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
- coeffRef(Index index) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
eigen_assert(m_data);
return m_data[index];
}
@@ -97,9 +94,7 @@ struct TensorEvaluator
}
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
- coeffRef(const array<DenseIndex, NumCoords>& coords) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
eigen_assert(m_data);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
@@ -157,8 +152,6 @@ struct TensorEvaluator<const Derived, Device>
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
- typedef const Derived XprType;
-
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index f060191..08eb559 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -253,7 +253,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
// get data into line_buf
const Index stride = m_strides[dim];
if (stride == 1) {
- m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+ memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
} else {
Index offset = base_offset;
for (int j = 0; j < line_len; ++j, offset += stride) {
@@ -271,7 +271,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
// write back
if (FFTDir == FFT_FORWARD && stride == 1) {
- m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+ memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
} else {
Index offset = base_offset;
const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
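
Both hunks above replace m_device.memcpy with a direct ::memcpy for the contiguous (stride == 1) fast path when moving a line of coefficients in and out of line_buf. A hedged sketch of that gather step, with std::complex<float> standing in for ComplexScalar:

    #include <complex>
    #include <cstring>

    void gatherLine(std::complex<float>* line_buf, const std::complex<float>* buf,
                    long base_offset, long stride, long line_len) {
      if (stride == 1) {
        // Contiguous line: one bulk copy, as in the patched code.
        std::memcpy(line_buf, buf + base_offset, line_len * sizeof(std::complex<float>));
      } else {
        // Strided line: gather element by element.
        long offset = base_offset;
        for (long j = 0; j < line_len; ++j, offset += stride) {
          line_buf[j] = buf[offset];
        }
      }
    }
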
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index abe85c8..bbd5eb3 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -26,8 +26,8 @@ namespace Eigen {
/// Therefore, by adding the default value, we managed to convert the type and it does not break any
/// existing code as its default value is T*.
namespace internal {
-template<typename XprType>
-struct traits<TensorForcedEvalOp<XprType> >
+template<typename XprType, template <class> class MakePointer_>
+struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename XprType::Scalar Scalar;
@@ -42,26 +42,31 @@ struct traits<TensorForcedEvalOp<XprType> >
enum {
Flags = 0
};
+ template <class T> struct MakePointer {
+ // Intermediate typedef to workaround MSVC issue.
+ typedef MakePointer_<T> MakePointerT;
+ typedef typename MakePointerT::Type Type;
+ };
};
-template<typename XprType>
-struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
+template<typename XprType, template <class> class MakePointer_>
+struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
{
- typedef const TensorForcedEvalOp<XprType>& type;
+ typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
};
-template<typename XprType>
-struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
+template<typename XprType, template <class> class MakePointer_>
+struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
{
- typedef TensorForcedEvalOp<XprType> type;
+ typedef TensorForcedEvalOp<XprType, MakePointer_> type;
};
} // end namespace internal
-template<typename XprType>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
+template<typename XprType, template <class> class MakePointer_>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -83,10 +88,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOn
};
-template<typename ArgType, typename Device>
-struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
+template<typename ArgType, typename Device, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
{
- typedef TensorForcedEvalOp<ArgType> XprType;
+ typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
typedef typename XprType::Index Index;
@@ -102,7 +107,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
- /// op_ is used for sycl
+ /// op_ is used for sycl
: m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
{ }
@@ -143,17 +148,17 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; }
+ EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
/// required by sycl in order to extract the sycl accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
+ const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
/// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+ const Device& device() const{return m_device;}
private:
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;
const Device& m_device;
- CoeffReturnType* m_buffer;
+ typename MakePointer<CoeffReturnType>::Type m_buffer;
};
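
The hunks above thread a MakePointer_ template template parameter through TensorForcedEvalOp and its evaluator so that the type of the evaluation buffer can be customized per device. A minimal sketch of that customization point with illustrative names, not Eigen's:

    // Default policy: MakePointer_<T>::Type is a raw pointer.
    template <typename T> struct DefaultMakePointer { typedef T* Type; };

    template <typename Scalar, template <class> class MakePointer_ = DefaultMakePointer>
    class ForcedEvalBuffer {
     public:
      typedef typename MakePointer_<Scalar>::Type PointerType;
      explicit ForcedEvalBuffer(PointerType buf) : m_buffer(buf) {}
      PointerType data() const { return m_buffer; }
     private:
      PointerType m_buffer;  // Scalar* by default; a device build could substitute a handle type
    };

    // Usage: ForcedEvalBuffer<float> stores a float*; a SYCL-style build could pass
    // a MakePointer_ that maps float to a buffer-backed pointer type instead.
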
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 2e63899..52b803d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -20,19 +20,7 @@ namespace Eigen {
// map_allocator.
template<typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
};
-#if defined(EIGEN_USE_SYCL)
-namespace TensorSycl {
-namespace internal{
-template <typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor;
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor;
-}
-}
-#endif
-
-
template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
@@ -75,7 +63,7 @@ template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
-template<typename XprType> class TensorForcedEvalOp;
+template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
template<typename ExpressionType, typename DeviceType> class TensorDevice;
template<typename Derived, typename Device> struct TensorEvaluator;
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 3b4f8ed..d73f6dc 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -33,7 +33,7 @@ struct functor_traits<scalar_mod_op<Scalar> >
*/
template <typename Scalar>
struct scalar_mod2_op {
- EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
};
template <typename Scalar>
@@ -42,7 +42,7 @@ struct functor_traits<scalar_mod2_op<Scalar> >
template <typename Scalar>
struct scalar_fmod_op {
- EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op)
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
operator()(const Scalar& a, const Scalar& b) const {
return numext::fmod(a, b);
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index ef1c9c4..ede3939 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -37,8 +37,6 @@ namespace {
{
#ifdef __CUDA_ARCH__
return __clz(val);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::clz(val);
#elif EIGEN_COMP_MSVC
unsigned long index;
_BitScanReverse(&index, val);
@@ -55,8 +53,6 @@ namespace {
{
#ifdef __CUDA_ARCH__
return __clzll(val);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::clz(val);
#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
unsigned long index;
_BitScanReverse64(&index, val);
@@ -92,8 +88,6 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
#if defined(__CUDA_ARCH__)
return __umulhi(a, b);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
#else
return (static_cast<uint64_t>(a) * b) >> 32;
#endif
@@ -103,8 +97,6 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(__CUDA_ARCH__)
return __umul64hi(a, b);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
#elif defined(__SIZEOF_INT128__)
__uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
return static_cast<uint64_t>(v >> 64);
@@ -124,7 +116,7 @@ namespace {
template <typename T>
struct DividerHelper<64, T> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
@@ -205,8 +197,6 @@ class TensorIntDivisor<int32_t, true> {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
#ifdef __CUDA_ARCH__
return (__umulhi(magic, n) >> shift);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
#else
uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
return (static_cast<uint32_t>(v >> 32) >> shift);
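
The hunks above strip the SYCL branches from the count_leading_zeros/muluh helpers but leave the core idea intact: division by a loop-invariant integer is replaced by a precomputed multiplier, a high multiply, and shifts. A self-contained 32-bit sketch, restricted to divisors in (1, 2^31]; Eigen's version covers the full range with 128-bit arithmetic and also handles the divider == 1 case.

    #include <cassert>
    #include <cstdint>

    struct FastDiv32 {
      uint64_t multiplier;
      int log_div;

      explicit FastDiv32(uint32_t divider) {
        assert(divider > 1 && divider <= (1u << 31));
        log_div = 0;                                        // ceil(log2(divider))
        while ((uint64_t(1) << log_div) < divider) ++log_div;
        // multiplier = 2^(32+log_div) / divider - 2^32 + 1, mirroring computeMultiplier()
        multiplier = ((uint64_t(1) << (32 + log_div)) / divider) - (uint64_t(1) << 32) + 1;
      }

      uint32_t divide(uint32_t n) const {
        const uint32_t t = uint32_t((multiplier * n) >> 32);  // high 32 bits, like muluh(multiplier, n)
        return (t + ((n - t) >> 1)) >> (log_div - 1);
      }
    };
    // Example: FastDiv32(7).divide(100) == 14, FastDiv32(4).divide(9) == 2
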
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index f92e39d..ee0078b 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -51,12 +51,4 @@
#endif
-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
-#define EIGEN_SLEEP(n) Sleep(n)
-#elif EIGEN_OS_GNULINUX
-#define EIGEN_SLEEP(n) usleep(n * 1000);
-#else
-#define EIGEN_SLEEP(n) sleep(std::max<unsigned>(1, n/1000))
-#endif
-
#endif
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b5ef31d..615559d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -75,7 +75,6 @@ struct PacketType<half, GpuDevice> {
HasSqrt = 1,
HasRsqrt = 1,
HasExp = 1,
- HasExpm1 = 0,
HasLog = 1,
HasLog1p = 0,
HasLog10 = 0,
@@ -169,12 +168,12 @@ template <typename Idx> struct IndexPair {
#ifdef EIGEN_HAS_SFINAE
namespace internal {
- template<typename IndexType, typename Index, Index... Is>
+ template<typename IndexType, Index... Is>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
return { idx[Is]... };
}
- template<typename IndexType, typename Index>
+ template<typename IndexType>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
return array<Index, 0>();
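
The hunk above drops the redundant `typename Index` parameter from customIndices2Array, letting numeric_list<Index, Is...> pin the index type. The same pack-expansion trick, sketched with std::index_sequence instead of Eigen's numeric_list:

    #include <array>
    #include <cstddef>
    #include <utility>

    // Expand a compile-time list of positions into a brace-initialized array
    // read from any indexable type.
    template <typename IndexType, std::size_t... Is>
    std::array<long, sizeof...(Is)> toArray(const IndexType& idx, std::index_sequence<Is...>) {
      return { { static_cast<long>(idx[Is])... } };
    }

    // Usage: toArray(custom_indices, std::make_index_sequence<3>{})
    // yields std::array<long, 3>{ idx[0], idx[1], idx[2] }.
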
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 6ddd2ca..d34f1e3 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -299,16 +299,6 @@ template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice> {
EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
};
#endif
-
-// It is very expensive to start the memcpy kernel on GPU: we therefore only
-// use it for large copies.
-#ifdef EIGEN_USE_SYCL
-template <typename Index> struct MemcpyTriggerForSlicing<Index, const Eigen::SyclDevice> {
- EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
- EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
-};
-#endif
-
}
// Eval as rvalue
@@ -503,14 +493,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
return NULL;
}
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{
- return m_impl;
- }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& startIndices() const{
- return m_offsets;
- }
+
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
@@ -711,12 +694,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
{
typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
static const int NumDims = internal::array_size<Strides>::value;
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef Strides Dimensions;
enum {
// Alignment can't be guaranteed at compile time since it depends on the
@@ -729,7 +706,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()), m_exprStartIndices(op.startIndices()), m_exprStopIndices(op.stopIndices())
+ : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
{
// Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
@@ -739,7 +716,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
}else{
- /* implies m_strides[i]<0 by assert */
+ /* implies m_strides[i]<0 by assert */
startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
}
@@ -802,6 +779,13 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
sizeof(Scalar));
}
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Strides Dimensions;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -827,15 +811,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
return NULL;
}
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStartIndices() const { return m_exprStartIndices; }
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStopIndices() const { return m_exprStopIndices; }
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
@@ -857,11 +832,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
}
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
-#ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value));
-#else
- return cl::sycl::clamp(value, min, max);
-#endif
}
array<Index, NumDims> m_outputStrides;
@@ -874,10 +845,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
const Strides m_strides;
std::size_t m_block_total_size_max;
- //use by sycl
- const StartIndices m_exprStartIndices;
- //use by sycl
- const StopIndices m_exprStopIndices;
};
// Eval as lvalue
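
The strided-slicing evaluator above clamps the user-supplied start/stop indices before computing the slice, and the patch keeps only the numext::maxi/mini form of clamp (the cl::sycl::clamp branch is removed). A small sketch of the clamping rule used in the constructor above:

    #include <algorithm>

    inline long clampIndex(long value, long lo, long hi) {
      return std::max(lo, std::min(hi, value));   // same shape as numext::maxi(min, numext::mini(max, value))
    }

    // Positive strides clamp start/stop into [0, dim]; negative strides into
    // [-1, dim - 1], so degenerate (empty) intervals fall out naturally.
    inline void clampSliceBounds(long& start, long& stop, long stride, long dim) {
      if (stride > 0) {
        start = clampIndex(start, 0, dim);
        stop  = clampIndex(stop, 0, dim);
      } else {                                    // stride < 0 (non-zero, asserted elsewhere)
        start = clampIndex(start, -1, dim - 1);
        stop  = clampIndex(stop, -1, dim - 1);
      }
    }
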
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index a8e2552..647bcf1 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -200,13 +200,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& padding_value() const { return m_paddingValue; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
-
private:
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
Index index, int dim_index) const {
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index e341e2e..41d0d00 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -11,20 +11,8 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
-// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
-// so we'll use a macro to make clang happy.
-#ifndef KERNEL_FRIEND
-#if defined(__clang__) && defined(__CUDA__)
-#define KERNEL_FRIEND friend __global__
-#else
-#define KERNEL_FRIEND friend
-#endif
-#endif
-
-
namespace Eigen {
-
/** \class TensorReduction
* \ingroup CXX11_Tensor_Module
*
@@ -692,23 +680,17 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
- template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+ template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
- template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
- template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
-#endif
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
-
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+ template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+ template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+ template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
#endif
+ template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
-#if defined(EIGEN_USE_SYCL)
- template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
- template<typename CoeffReturnType_ ,typename OutAccessor_, typename HostExpr_, typename FunctorExpr_, typename Op_, typename Dims_, typename Index_, typename TupleType_> friend class TensorSycl::internal::FullReductionKernelFunctor;
+ template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif
-
template <typename S, typename O, typename D> friend struct internal::InnerReducer;
// Returns the Index in the input tensor of the first value that needs to be
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index edb0ab2..65638b6 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -287,6 +287,7 @@ struct FullReductionLauncher<
void>::type> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
+ typedef typename Self::CoeffReturnType Scalar;
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index c3ca129..3daecb0 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -25,28 +25,61 @@
namespace Eigen {
namespace internal {
-template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
+template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
-static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
do {
- auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
+ auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
cl::sycl::range<1>{std::min(length, local)}};
/* Two accessors are used: one to the buffer that is being reduced,
* and a second to local memory, used to store intermediate data. */
- auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
- auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
- typedef decltype(aI) InputAccessor;
- typedef decltype(aOut) OutputAccessor;
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
- LocalAccessor scratch(cl::sycl::range<1>(local), h);
+ auto aI =
+ bufI.template get_access<cl::sycl::access::mode::read_write>(h);
+ auto aOut =
+ bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
+ cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ scratch(cl::sycl::range<1>(local), h);
/* The parallel_for invocation chosen is the variant with an nd_item
* parameter, since the code requires barriers for correctness. */
- h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
+ h.parallel_for<KernelName>(
+ r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
+ size_t globalid = id.get_global(0);
+ size_t localid = id.get_local(0);
+ /* All threads collectively read from global memory into local.
+ * The barrier ensures all threads' IO is resolved before
+ * execution continues (strictly speaking, all threads within
+ * a single work-group - there is no co-ordination between
+ * work-groups, only work-items). */
+ if (globalid < length) {
+ scratch[localid] = aI[globalid];
+ }
+ id.barrier(cl::sycl::access::fence_space::local_space);
+
+ /* Apply the reduction operation between the current local
+ * id and the one on the other half of the vector. */
+ if (globalid < length) {
+ int min = (length < local) ? length : local;
+ for (size_t offset = min / 2; offset > 0; offset /= 2) {
+ if (localid < offset) {
+ scratch[localid] += scratch[localid + offset];
+ }
+ id.barrier(cl::sycl::access::fence_space::local_space);
+ }
+ /* The final result will be stored in local id 0. */
+ if (localid == 0) {
+ aI[id.get_group(0)] = scratch[localid];
+ if((length<=local) && globalid ==0){
+ aOut[globalid]=scratch[localid];
+ }
+ }
+ }
+ });
};
- dev.sycl_queue().submit(f);
- dev.asynchronousExec();
+ dev.m_queue.submit(f);
+ dev.m_queue.throw_asynchronous();
/* At this point, you could queue::wait_and_throw() to ensure that
* errors are caught quickly. However, this would likely impact
@@ -54,23 +87,18 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev
length = length / local;
} while (length > 1);
-}
-};
-template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
-template<typename BufferTOut, typename BufferTIn>
-static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
- syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
- bufOut, bufI, dev, length, local);
+
}
+
};
+/// For now let's start with a full reducer
/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the
/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
// a leafNode.
-
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
@@ -79,8 +107,8 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
- FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
+ typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+ auto functors = TensorSycl::internal::extractFunctors(self.impl());
int red_factor =256; /// initial reduction. If the size is less than red_factor we only create one thread.
size_t inputSize =self.impl().dimensions().TotalSize();
size_t rng = inputSize/red_factor; // the initial number of threads is the input size divided by red_factor
@@ -88,7 +116,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
if(rng ==0) {
red_factor=1;
};
- size_t tileSize =dev.sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+ size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
size_t GRange=std::max((size_t )1, rng);
// convert the global range to a power of 2 for the reduction
@@ -105,66 +133,105 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
size_t outTileSize = tileSize;
/// if GRange is smaller than the tile size, we clamp the tile size to GRange; in this case a single work-group is launched and the recursion reduces everything to one value.
if (GRange < outTileSize) outTileSize=GRange;
+ // get the final output buffer; it is created directly from the output pointer because there is no need for an assign op
+ auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
/// creating the shared memory for calculating the reduction.
/// This buffer is used to collect all the partially reduced values from shared memory, since there is no global barrier on the GPU. Once it is saved we can
/// recursively apply the reduction to it in order to reduce the whole buffer.
auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
- // Dims dims= self.xprDims();
- //Op functor = reducer;
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // this is a workaround for gcc 4.8 bug
- typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType;
+ Dims dims= self.xprDims();
+ Op functor = reducer;
+ dev.m_queue.submit([&](cl::sycl::handler &cgh) {
// create a tuple of accessors from Evaluator
- TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+ auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(tmp_global_accessor) OutAccessor;
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
- TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, TupleType>
- (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors));
+
+ cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
+ typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+ auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ /// The reduction cannot be captured automatically through our device-conversion recursion, because a reduction has two behaviours:
+ /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes its
+ /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
+ const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+ /// This is the evaluator for device_self_expr. It is equivalent to the self that was passed to the run function; the difference is
+ /// that the device evaluator is detectable and recognisable on the device.
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ /// const_cast added as a naive workaround for the qualifier-drop error
+ auto globalid=itemID.get_global_linear_id();
+
+ if(globalid<rng)
+ tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
+ else
+ tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
+
+ if(remaining!=0 && globalid==0 )
+ // this adds the remainder of the input buffer when the input size is not divisible by red_factor.
+ tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
+ });
});
- dev.asynchronousExec();
+ dev.m_queue.throw_asynchronous();
- // getting final out buffer at the moment the created buffer is true because there is no need for assign
- auto out_buffer =dev.get_sycl_buffer(output);
- /// This is used to recursively reduce the tmp value to an element of 1;
- syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+/// This is used to recursively reduce the tmp values down to a single element.
+ syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
}
};
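A note on the launch geometry used by FullReducer above: the first stage gives each of rng = inputSize / red_factor threads a contiguous block of red_factor coefficients to reduce, and thread 0 additionally folds in the tail when the input size is not divisible by red_factor; the resulting partial values land in temp_global_buffer and are then reduced recursively by syclGenericBufferReducer. The following stand-alone C++ sketch shows only that partitioning arithmetic (first_stage is a hypothetical helper, not Eigen API, and it serialises what the kernel does in parallel):

#include <cstddef>
#include <numeric>
#include <vector>

std::vector<float> first_stage(const std::vector<float>& in, std::size_t red_factor) {
  std::size_t rng = in.size() / red_factor;        // number of "threads" in the first pass
  std::size_t remaining = in.size() % red_factor;  // tail folded in by thread 0
  if (rng == 0) { red_factor = 1; rng = in.size(); remaining = 0; }
  std::vector<float> partial(rng, 0.f);
  for (std::size_t tid = 0; tid < rng; ++tid)      // each thread reduces red_factor elements
    partial[tid] = std::accumulate(in.begin() + tid * red_factor,
                                   in.begin() + (tid + 1) * red_factor, 0.f);
  if (remaining != 0)                              // thread 0 also handles the remainder
    partial[0] += std::accumulate(in.begin() + rng * red_factor, in.end(), 0.f);
  return partial;  // this buffer is then reduced recursively, as above
}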
-
template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
typedef typename Self::CoeffReturnType CoeffReturnType;
static const bool HasOptimizedImplementation = false;
- static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) {
+ static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
- FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
- typename Self::Index range, GRange, tileSize;
- typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+ typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+ auto functors = TensorSycl::internal::extractFunctors(self.impl());
+
+ size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+ size_t GRange=num_coeffs_to_preserve;
+ if (tileSize>GRange) tileSize=GRange;
+ else if(GRange>tileSize){
+ size_t xMode = GRange % tileSize;
+ if (xMode != 0) GRange += (tileSize - xMode);
+ }
// getting the final output buffer; it is created directly from the output pointer because there is no need for an assign op
/// creating the shared memory for calculating the reduction.
/// This buffer is used to collect all the partially reduced values from shared memory, since there is no global barrier on the GPU. Once it is saved we can
/// recursively apply the reduction to it in order to reduce the whole buffer.
- dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // this is workaround for gcc 4.8 bug.
- typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
- // create a tuple of accessors from Evaluator
- Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
- Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
- TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
- (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
+ typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+ Dims dims= self.xprDims();
+ Op functor = reducer;
+ dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+ // create a tuple of accessors from Evaluator
+ auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+ auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
+
+ cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+ typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+ auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ /// The reduction cannot be captured automatically through our device-conversion recursion, because a reduction has two behaviours:
+ /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes its
+ /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
+ const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+ /// This is the evaluator for device_self_expr. It is equivalent to the self that was passed to the run function; the difference is
+ /// that the device evaluator is detectable and recognisable on the device.
+ typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeiceSelf;
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ /// const_cast added as a naive workaround for the qualifier-drop error
+ auto globalid=itemID.get_global_linear_id();
+ if (globalid< static_cast<size_t>(num_coeffs_to_preserve)) {
+ typename DeiceSelf::CoeffReturnType accum = functor.initialize();
+ GenericDimReducer<DeiceSelf::NumReducedDims-1, DeiceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast<Op&>(functor), &accum);
+ functor.finalize(accum);
+ output_accessor.get_pointer()[globalid]= accum;
+ }
+ });
});
- dev.asynchronousExec();
+ dev.m_queue.throw_asynchronous();
return false;
}
};
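The InnerReducer above sizes its launch by rounding the global range up to a whole number of work-groups, so the guarded `globalid < num_coeffs_to_preserve` check inside the kernel covers every output coefficient without out-of-bounds work. A minimal sketch of that computation follows; the helper name launch_range is purely illustrative:

#include <cstddef>
#include <utility>

// Returns {global range, work-group size} for n outputs, given the device's
// maximum work-group size (the code above uses max_work_group_size / 2).
std::pair<std::size_t, std::size_t> launch_range(std::size_t n, std::size_t max_wg) {
  std::size_t tile = max_wg / 2;
  std::size_t grange = n;
  if (tile > grange) {
    tile = grange;                       // small problem: a single, smaller group
  } else {
    std::size_t rem = grange % tile;
    if (rem != 0) grange += tile - rem;  // pad up to a multiple of the tile size
  }
  return std::make_pair(grange, tile);
}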
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index e430b08..14e392e 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -224,11 +224,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; }
- /// added for sycl in order to construct the buffer from sycl device
- ReverseDimensions functor() const { return m_reverse; }
-
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index edc9dd3..113c060 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -117,7 +117,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation())
+ : m_impl(op.expression(), device)
{
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const Shuffle& shuffle = op.shufflePermutation();
@@ -187,11 +187,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- // required by sycl
- EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;}
- // required by sycl
- EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {return m_impl;}
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
Index inputIndex = 0;
@@ -211,12 +206,11 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
return inputIndex + index * m_inputStrides[NumDims - 1];
}
}
+
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
- /// required by sycl
- Shuffle m_shuffle;
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index e6a666f..2854a4a 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -31,12 +31,12 @@ namespace Eigen {
*
* \sa Tensor
*/
-template<typename T, typename Dimensions, int Options> class TensorStorage;
+template<typename T, typename Dimensions, int Options_> class TensorStorage;
// Pure fixed-size storage
-template<typename T, typename FixedDimensions, int Options_>
-class TensorStorage
+template<typename T, int Options_, typename FixedDimensions>
+class TensorStorage<T, FixedDimensions, Options_>
{
private:
static const std::size_t Size = FixedDimensions::total_size;
@@ -66,7 +66,7 @@ class TensorStorage
// pure dynamic
-template<typename T, typename IndexType, int NumIndices_, int Options_>
+template<typename T, int Options_, typename IndexType, int NumIndices_>
class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
{
public:
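The TensorStorage hunks above only reorder the template parameters of the partial specializations; the specialized argument lists still match the primary template, so the same specializations are selected. A tiny stand-alone illustration of that C++ rule, with hypothetical names (Storage, Dims) rather than the real Eigen classes:

#include <cstddef>

template <typename T, typename Dimensions, int Options> class Storage;  // primary template

template <std::size_t N> struct Dims {};

// The specialization's own parameter list may be ordered differently from the
// primary's; what matters is the specialized argument list <T, Dims<N>, Options>.
template <typename T, int Options, std::size_t N>
class Storage<T, Dims<N>, Options> {
 public:
  T data[N];
};

Storage<float, Dims<4>, 0> s;  // still picks the specialization above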
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 2237140..6c35bfd 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -117,11 +117,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_strides(op.strides())
+ : m_impl(op.expression(), device)
{
m_dimensions = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
- m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+ m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -224,11 +224,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- /// required by sycl in order to extract the accessor
- Strides functor() const { return m_strides; }
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
@@ -255,9 +250,9 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
- const Strides m_strides;
};
+
// Eval as lvalue
template<typename Strides, typename ArgType, typename Device>
struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
@@ -291,11 +286,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
return this->m_impl.coeffRef(this->srcCoeff(index));
}
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; }
-
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
index 9d5a6d4..bb8800d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -20,14 +20,12 @@
template <class T>
struct MakeGlobalPointer {
typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
- typedef typename cl::sycl::global_ptr<T>::reference_t RefType;
};
// global pointer to set different attribute state for a class
template <class T>
struct MakeLocalPointer {
typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
- typedef typename cl::sycl::local_ptr<T>::reference_t RefType;
};
@@ -35,9 +33,6 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {
- template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
-
-
/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
struct NoOP;
@@ -80,15 +75,8 @@ template<typename T> struct GetType<false, T>{
/// this is used for extracting tensor reduction
#include "TensorReductionSycl.h"
-/// this is used for extracting tensor convolution
-#include "TensorConvolutionSycl.h"
-
// kernel execution using fusion
#include "TensorSyclRun.h"
-//sycl functors
-#include "TensorSyclFunctors.h"
-
-#include "TensorContractionSycl.h"
#endif // end of EIGEN_USE_SYCL
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
index ee8f3c9..8729c86 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -48,9 +48,9 @@ struct DeviceConvertor{
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
/// type is TensorMap
#define TENSORMAPCONVERT(CVQual)\
-template <typename T, int Options_, template <class> class MakePointer_>\
-struct ConvertToDeviceExpression<CVQual TensorMap<T, Options_, MakePointer_> > {\
- typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\
+template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_>\
+struct ConvertToDeviceExpression<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_> > {\
+ typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
};
TENSORMAPCONVERT(const)
@@ -97,18 +97,8 @@ template <typename Expr>\
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
: DeviceConvertor<ExprNode, Res, Expr>{};
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp
-#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\
-template <typename Expr>\
-struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\
- typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\
-};
-KERNELBROKERCONVERTFORCEDEVAL(const)
-KERNELBROKERCONVERTFORCEDEVAL()
-#undef KERNELBROKERCONVERTFORCEDEVAL
-
-
-
+KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)
+KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
KERNELBROKERCONVERT(const, true, TensorEvalToOp)
KERNELBROKERCONVERT(, false, TensorEvalToOp)
#undef KERNELBROKERCONVERT
@@ -124,40 +114,6 @@ KERNELBROKERCONVERTREDUCTION(const)
KERNELBROKERCONVERTREDUCTION()
#undef KERNELBROKERCONVERTREDUCTION
-#define KERNELBROKERCONVERTSLICEOP(CVQual)\
-template<typename StartIndices, typename Sizes, typename XprType>\
-struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename ConvertToDeviceExpression<XprType>::Type> Type;\
-};
-
-KERNELBROKERCONVERTSLICEOP(const)
-KERNELBROKERCONVERTSLICEOP()
-#undef KERNELBROKERCONVERTSLICEOP
-
-
-#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
-struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename ConvertToDeviceExpression<XprType>::Type> Type;\
-};
-
-KERNELBROKERCONVERTERSLICESTRIDEOP(const)
-KERNELBROKERCONVERTERSLICESTRIDEOP()
-#undef KERNELBROKERCONVERTERSLICESTRIDEOP
-
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp
-#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\
-template <DenseIndex DimId, typename Expr>\
-struct ConvertToDeviceExpression<CVQual TensorChippingOp<DimId, Expr> > {\
- typedef CVQual TensorChippingOp<DimId, typename ConvertToDeviceExpression<Expr>::Type> Type;\
-};
-KERNELBROKERCONVERTCHIPPINGOP(const)
-KERNELBROKERCONVERTCHIPPINGOP()
-#undef KERNELBROKERCONVERTCHIPPINGOP
-
-
-
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen
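ConvertToDeviceExpression, as used above, is a type-level rewrite: it walks the expression type and replaces the pointer-maker of each leaf (TensorMap, forced-eval, eval-to, reduction) with MakeGlobalPointer so the same tree can be rebuilt on the device. A minimal, self-contained sketch of that trait pattern, where every name (ToDevice, Leaf, Add, the pointer makers) is hypothetical:

template <class T> struct MakeHostPointer   { typedef T* Type; };
template <class T> struct MakeDevicePointer { typedef T* Type; };  // stand-in for MakeGlobalPointer

template <typename Scalar, template <class> class MakePointer>
struct Leaf {};                                        // plays the role of TensorMap

template <typename Lhs, typename Rhs> struct Add {};   // a non-leaf node

template <typename Expr> struct ToDevice;              // primary declaration

template <typename Scalar, template <class> class MakePointer>
struct ToDevice<Leaf<Scalar, MakePointer> > {          // leaf: swap the pointer maker
  typedef Leaf<Scalar, MakeDevicePointer> Type;
};

template <typename Lhs, typename Rhs>
struct ToDevice<Add<Lhs, Rhs> > {                      // non-leaf: recurse on the children
  typedef Add<typename ToDevice<Lhs>::Type, typename ToDevice<Rhs>::Type> Type;
};

// ToDevice<Add<Leaf<float, MakeHostPointer>, Leaf<float, MakeHostPointer> > >::Type
// is Add<Leaf<float, MakeDevicePointer>, Leaf<float, MakeDevicePointer> >.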
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
index 3b83b1d..7ed3a3a 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -25,21 +25,12 @@
namespace Eigen {
namespace TensorSycl {
namespace internal {
-
-template <typename Expr, typename Dims>
-struct DeviceFixedSizeTensor;
-
-template <typename Expr, typename std::ptrdiff_t... Indices>
-struct DeviceFixedSizeTensor<Expr, Eigen::Sizes<Indices...>>{
- template<typename Data>
- static EIGEN_ALWAYS_INLINE Expr instantiate(Data& dt) {return Expr(ConvertToActualTypeSycl(typename Expr::Scalar, dt), Indices...);}
-};
/// this class is used by EvalToOp in order to create an lhs expression which is
/// a pointer obtained from an accessor on a device-only buffer
template <typename PtrType, size_t N, typename... Params>
struct EvalToLHSConstructor {
PtrType expr;
- EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t) : expr(ConvertToActualTypeSycl(typename Eigen::internal::remove_all<PtrType>::type, utility::tuple::get<N>(t))) {}
+ EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t): expr((&(*(utility::tuple::get<N>(t).get_pointer())))) {}
};
/// \struct ExprConstructor is used to reconstruct the expression on the device and
@@ -54,39 +45,21 @@ struct ExprConstructor;
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorMap
#define TENSORMAP(CVQual)\
-template <typename T, int Options_,\
+template <typename Scalar_, int Options_, int Options2_, int Options3_, int NumIndices_, typename IndexType_,\
template <class> class MakePointer_, size_t N, typename... Params>\
-struct ExprConstructor< CVQual TensorMap<T, Options_, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\
- typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\
+struct ExprConstructor< CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>,\
+CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options3_, MakePointer_>, N>, Params...>{\
+ typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\
+ : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
};
-
TENSORMAP(const)
TENSORMAP()
#undef TENSORMAP
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorMap
-#define TENSORMAPFIXEDSIZE(CVQual)\
-template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_,\
-template <class> class MakePointer_, size_t N, typename... Params>\
-struct ExprConstructor< CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_>, N>, Params...>{\
- typedef CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer> Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\
- : expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\
-};
-TENSORMAPFIXEDSIZE(const)
-TENSORMAPFIXEDSIZE()
-#undef TENSORMAPFIXEDSIZE
-
#define UNARYCATEGORY(CVQual)\
template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\
struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\
@@ -188,30 +161,8 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual
ASSIGN(const)
ASSIGN()
#undef ASSIGN
-
-
-
-
- /// specialisation of the \ref ExprConstructor struct when the node type is
- /// const TensorAssignOp
- #define CONVERSIONEXPRCONST(CVQual)\
- template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\
- struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\
- typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\
- typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\
- my_nested_type nestedExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\
- };
-
- CONVERSIONEXPRCONST(const)
- CONVERSIONEXPRCONST()
- #undef CONVERSIONEXPRCONST
-
/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorEvalToOp /// 0 here is the output number in the buffer
+/// TensorEvalToOp
#define EVALTO(CVQual)\
template <typename OrigExpr, typename Expr, typename... Params>\
struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\
@@ -234,14 +185,14 @@ EVALTO()
/// TensorForcedEvalOp
#define FORCEDEVAL(CVQual)\
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
+struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
- typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
- TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
+ typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
+ TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, 0, typename TensorForcedEvalOp<DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
+ : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
};
FORCEDEVAL(const)
@@ -262,130 +213,17 @@ struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPoi
CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
- NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
+ NumIndices, 0, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
+ : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
};
SYCLREDUCTIONEXPR(const)
SYCLREDUCTIONEXPR()
#undef SYCLREDUCTIONEXPR
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorContractionOp
-#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
-template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
-struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
-CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
- static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
- typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
- NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\
- typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\
- Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
-};
-
-SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTION
-
-
-
-#define SYCLSLICEOPEXPR(CVQual)\
-template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.dimensions()) {}\
-};
-
-SYCLSLICEOPEXPR(const)
-SYCLSLICEOPEXPR()
-#undef SYCLSLICEOPEXPR
-
-
-#define SYCLSLICESTRIDEOPEXPR(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.stopIndices(),funcD.strides()) {}\
-};
-
-SYCLSLICESTRIDEOPEXPR(const)
-SYCLSLICESTRIDEOPEXPR()
-#undef SYCLSLICESTRIDEOPEXPR
-
-#define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\
-template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\
-};
-
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const)
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, )
-
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const)
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, )
-#undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST
-
-#define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\
-template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\
-};
-
-SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const)
-SYCLPADDINGOPEXPRCONST(TensorPaddingOp, )
-#undef SYCLPADDINGOPEXPRCONST
-
-
-// TensorChippingOp
-#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\
-template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorChippingOp <DimId, OrigXprType> , CVQual TensorChippingOp<DimId, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorChippingOp<DimId, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.offset(), funcD.dimId()) {}\
-};
-
-SYCLTENSORCHIPPINGOPEXPR(const)
-SYCLTENSORCHIPPINGOPEXPR()
-#undef SYCLTENSORCHIPPINGOPEXPR
-
-
/// template deduction for \ref ExprConstructor struct
template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t)
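The ExprConstructor specializations above rebuild the expression on the device: each placeholder leaf carries an index N, and the constructor fetches the N-th entry of the accessor tuple and turns it into the pointer of the reconstructed TensorMap. The same idea in a small, self-contained form, using std::tuple in place of utility::tuple and hypothetical names (PlaceHolder here is a stand-in, not the real Eigen class):

#include <cstddef>
#include <tuple>

template <typename T, std::size_t N> struct PlaceHolder {};  // compile-time leaf marker

template <typename T> struct DeviceLeaf {                    // the rebuilt leaf
  T* ptr;
  explicit DeviceLeaf(T* p) : ptr(p) {}
};

template <typename PH> struct Rebuild;                       // primary declaration

template <typename T, std::size_t N>
struct Rebuild<PlaceHolder<T, N> > {
  template <typename Tuple>
  static DeviceLeaf<T> run(const Tuple& t) {
    return DeviceLeaf<T>(std::get<N>(t));                    // N-th accessor becomes the pointer
  }
};

// Usage: with std::tuple<float*, double*> accs, Rebuild<PlaceHolder<double, 1> >::run(accs)
// wraps the second pointer, much as the real code builds a TensorMap from
// utility::tuple::get<N>(t) and fd.dimensions().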
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
index b512d43..b1da685 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -35,8 +35,6 @@
namespace Eigen {
namespace TensorSycl {
namespace internal {
-#define RETURN_CPP11(expr) ->decltype(expr) {return expr;}
-
/// \struct ExtractAccessor: Extract Accessor Class is used to extract the
/// accessor from a buffer.
/// Depending on the type of the leaf node we can get a read accessor or a
@@ -45,192 +43,159 @@ template <typename Evaluator>
struct ExtractAccessor;
struct AccessorConstructor{
- template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval)
- RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval))
-
- template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2)
- RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2)))
-
- template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3)
- RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))))
-
- template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval)
- RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data())))
+ template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, Arg eval)
+ -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) {
+ return ExtractAccessor<Arg>::getTuple(cgh, eval);
+ }
+
+ template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1, Arg2 eval2)
+ -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) {
+ return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2));
+ }
+ template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1 , Arg2 eval2 , Arg3 eval3)
+ -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) {
+ return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
+ }
+ template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval)
+ -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM,
+ typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){
+ return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM, typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()));
+ }
};
/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp
-#define SYCLUNARYCATEGORYEXTACC(CVQual)\
-template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\
-RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
+/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp and const TensorBroadcastingOp
+template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){
+ return AccessorConstructor::getTuple(cgh, eval.impl());
+ }
};
-SYCLUNARYCATEGORYEXTACC(const)
-SYCLUNARYCATEGORYEXTACC()
-#undef SYCLUNARYCATEGORYEXTACC
-
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
-#define SYCLBINARYCATEGORYEXTACC(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp
+template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+ return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+ }
};
-
-SYCLBINARYCATEGORYEXTACC(const)
-SYCLBINARYCATEGORYEXTACC()
-#undef SYCLBINARYCATEGORYEXTACC
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// const TensorCwiseTernaryOp
-#define SYCLTERNARYCATEGORYEXTACC(CVQual)\
-template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\
+template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){
+ return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());
+ }
};
-SYCLTERNARYCATEGORYEXTACC(const)
-SYCLTERNARYCATEGORYEXTACC()
-#undef SYCLTERNARYCATEGORYEXTACC
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseTernaryOp
+template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorCwiseSelectOp. This is a special case where there is no OP
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){
+ return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());
+ }
+};
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorCwiseSelectOp. This is a special case where there is no OP
-#define SYCLSELECTOPEXTACC(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorAssignOp
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+ return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+ }
};
-SYCLSELECTOPEXTACC(const)
-SYCLSELECTOPEXTACC()
-#undef SYCLSELECTOPEXTACC
-
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
-#define SYCLTENSORASSIGNOPEXTACC(CVQual)\
-template <typename LHSExpr, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
-};
-
- SYCLTENSORASSIGNOPEXTACC(const)
- SYCLTENSORASSIGNOPEXTACC()
- #undef SYCLTENSORASSIGNOPEXTACC
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
#define TENSORMAPEXPR(CVQual, ACCType)\
template <typename PlainObjectType, int Options_, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\
+ static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> eval)\
+ -> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
+ return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
+ }\
};
-
TENSORMAPEXPR(const, cl::sycl::access::mode::read)
TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
#undef TENSORMAPEXPR
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
-#define SYCLFORCEDEVALEXTACC(CVQual)\
-template <typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorForcedEvalOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> eval)
+ -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
+ return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
+ }
};
-SYCLFORCEDEVALEXTACC(const)
-SYCLFORCEDEVALEXTACC()
-#undef SYCLFORCEDEVALEXTACC
-
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorForcedEvalOp<Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorEvalToOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<const TensorEvalToOp<Expr>, Dev> eval)
+ -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){
+ return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));
+ }
+};
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
-#define SYCLEVALTOEXTACC(CVQual)\
-template <typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\
- RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorEvalToOp<Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorReductionOp
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> eval)
+ -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
+ return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
+ }
};
-SYCLEVALTOEXTACC(const)
-SYCLEVALTOEXTACC()
-#undef SYCLEVALTOEXTACC
-
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXTACC(CVQual)\
-template <typename OP, typename Dim, typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-SYCLREDUCTIONEXTACC(const)
-SYCLREDUCTIONEXTACC()
-#undef SYCLREDUCTIONEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
-#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
- struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTIONEXTACC
-
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorSlicingOp.
-#define SYCLSLICEOPEXTACC(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\
- RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICEOPEXTACC(const)
-SYCLSLICEOPEXTACC()
-#undef SYCLSLICEOPEXTACC
-// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorStridingSlicingOp.
-#define SYCLSLICESTRIDEOPEXTACC(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICESTRIDEOPEXTACC(const)
-SYCLSLICESTRIDEOPEXTACC()
-#undef SYCLSLICESTRIDEOPEXTACC
-
-// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorChippingOp.
-#define SYCLTENSORCHIPPINGOPEXTACC(CVQual)\
-template<DenseIndex DimId, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLTENSORCHIPPINGOPEXTACC(const)
-SYCLTENSORCHIPPINGOPEXTACC()
-#undef SYCLTENSORCHIPPINGOPEXTACC
-
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorReductionOp<OP, Dim, Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> >{};
/// template deduction for \ref ExtractAccessor
template <typename Evaluator>
-auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval)
--> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, eval)) {
- return ExtractAccessor<Evaluator>::getTuple(cgh, eval);
+auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& expr)
+-> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, expr)) {
+ return ExtractAccessor<Evaluator>::getTuple(cgh, expr);
}
} /// namespace TensorSycl
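The rewrite of TensorSyclExtractAccessor.h above trades the CVQual macros for a pattern in which the const-qualified specialization holds the implementation and the non-const specialization simply inherits from it. A stand-alone sketch of that deduplication idiom, with hypothetical names (Extract, Wrapper):

template <typename Expr> struct Extract;        // primary declaration

template <typename T> struct Wrapper {};        // plays the role of an expression node

template <typename T>
struct Extract<const Wrapper<T> > {             // all the work lives in the const case
  static int getTuple() { return 1; }
};

template <typename T>
struct Extract<Wrapper<T> >
  : Extract<const Wrapper<T> > {};              // non-const forwards to the const case

int main() {
  return Extract<Wrapper<float> >::getTuple();  // uses the inherited implementation
}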
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index ee02018..4271253 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -36,277 +36,135 @@ namespace internal {
template <typename Evaluator> struct FunctorExtractor{
typedef typename Evaluator::Dimensions Dimensions;
const Dimensions m_dimensions;
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+ const Dimensions& dimensions() const { return m_dimensions; }
FunctorExtractor(const Evaluator& expr)
: m_dimensions(expr.dimensions()) {}
};
-/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
-///TensorConversionOp
-#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
-template <typename ArgType1, typename ArgType2, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\
- FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\
- : subExpr(expr.impl()) {}\
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp, and const TensorBroadcastingOp
+template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ OP func;
+ FunctorExtractor(const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev>& expr)
+ : rhsExpr(expr.impl()), func(expr.functor()) {}
};
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
+template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> >{};
-SYCLEXTRFUNCCONVERSION(TensorConversionOp, const)
-SYCLEXTRFUNCCONVERSION(TensorConversionOp, )
-#undef SYCLEXTRFUNCCONVERSION
-
-#define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\
-template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\
-struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\
-FunctorExtractor(const TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev>& ){}\
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ OP func;
+ FunctorExtractor(const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)
+ : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}
};
-SYCLEXTRTENSORMAPFIXEDSIZE(const)
-SYCLEXTRTENSORMAPFIXEDSIZE()
-#undef SYCLEXTRTENSORMAPFIXEDSIZE
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseBinaryOp
+template <template <class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
-#define SYCLEXTRFUNCUNARY(CVQual)\
-template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& expr)\
- : rhsExpr(expr.impl()), func(expr.functor()) {}\
+/// const TensorCwiseTernaryOp
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;
+ FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;
+ FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;
+ OP func;
+ FunctorExtractor(const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)
+ : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}
};
-SYCLEXTRFUNCUNARY(const)
-SYCLEXTRFUNCUNARY()
-#undef SYCLEXTRFUNCUNARY
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseTernaryOp
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct FunctorExtractor<TensorEvaluator< TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
+:FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseBinaryOp
-#define SYCLEXTRFUNCBIINARY(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}\
+/// const TensorSelectOp. This is a specialisation without OP so it has to be separated.
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;
+ FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;
+ FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;
+ FunctorExtractor(const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)
+ : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}
};
-SYCLEXTRFUNCBIINARY(const)
-SYCLEXTRFUNCBIINARY()
-#undef SYCLEXTRFUNCBIINARY
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorSelectOp. This is a specialisation without OP so it has to be separated.
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
+:FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {};
-/// specialisation of the \ref FunctorExtractor struct when the node type is TensorCwiseTernaryOp
-#define SYCLEXTRFUNCTERNARY(CVQual)\
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;\
- FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;\
- FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)\
- : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}\
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorAssignOp. This is a specialisation without OP so it has to be separated.
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ FunctorExtractor(const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)
+ : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}
};
-SYCLEXTRFUNCTERNARY(const)
-SYCLEXTRFUNCTERNARY()
-#undef SYCLEXTRFUNCTERNARY
-
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCSELECTOP(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
-struct FunctorExtractor< TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;\
- FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;\
- FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)\
- : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}\
-};
+/// TensorAssignOp. This is a specialisation without OP so it has to be separated.
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
+:FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
-SYCLEXTRFUNCSELECTOP(const)
-SYCLEXTRFUNCSELECTOP()
-#undef SYCLEXTRFUNCSELECTOP
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorAssignOp. This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCASSIGNOP(CVQual)\
-template <typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}\
+/// const TensorEvalToOp. This is a specialisation without OP so it has to be separated.
+template <typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ FunctorExtractor(const TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev>& expr)
+ : rhsExpr(expr.impl()) {}
};
-SYCLEXTRFUNCASSIGNOP(const)
-SYCLEXTRFUNCASSIGNOP()
-#undef SYCLEXTRFUNCASSIGNOP
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorEvalToOp, This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCEVALTOOP(CVQual)\
-template <typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev>& expr)\
- : rhsExpr(expr.impl()) {}\
-};
-
-SYCLEXTRFUNCEVALTOOP(const)
-SYCLEXTRFUNCEVALTOOP()
-#undef SYCLEXTRFUNCEVALTOOP
+/// TensorEvalToOp. This is a specialisation without OP so it has to be separated.
+template <typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorEvalToOp<RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {};
template<typename Dim, size_t NumOutputDim> struct DimConstr {
template<typename InDim>
- static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return dims;}
+ static inline Dim getDim(InDim dims ) {return dims;}
};
template<typename Dim> struct DimConstr<Dim, 0> {
template<typename InDim>
- static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));}
-};
-
-#define SYCLEXTRFUNCREDUCTIONOP(CVQual)\
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{\
- typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\
- typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
- const Dimensions m_dimensions;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\
- : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\
-};
-
-
-SYCLEXTRFUNCREDUCTIONOP(const)
-SYCLEXTRFUNCREDUCTIONOP()
-#undef SYCLEXTRFUNCREDUCTIONOP
-
-#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
- typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\
- typedef typename Evaluator::Dimensions Dimensions;\
- const Dimensions m_dimensions;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\
- : m_dimensions(expr.dimensions()) {}\
+ static inline Dim getDim(InDim dims ) {return Dim(dims.TotalSize());}
};
-
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
-#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorSlicingOp. This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCTSLICEOP(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
- FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
- const StartIndices m_offsets;\
- const Sizes m_dimensions;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_offsets(expr.startIndices()), m_dimensions(expr.dimensions()) {}\
- EIGEN_STRONG_INLINE const StartIndices& startIndices() const {return m_offsets;}\
- EIGEN_STRONG_INLINE const Sizes& dimensions() const {return m_dimensions;}\
-};
-
-SYCLEXTRFUNCTSLICEOP(const)
-SYCLEXTRFUNCTSLICEOP()
-#undef SYCLEXTRFUNCTSLICEOP
-
-#define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
- FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
- const StartIndices m_startIndices;\
- const StopIndices m_stopIndices;\
- const Strides m_strides;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices,Strides, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_startIndices(expr.exprStartIndices()), m_stopIndices(expr.exprStopIndices()), m_strides(expr.strides()) {}\
- EIGEN_STRONG_INLINE const StartIndices& startIndices() const { return m_startIndices; }\
- EIGEN_STRONG_INLINE const StartIndices& stopIndices() const { return m_stopIndices; }\
- EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; }\
-};
-
-SYCLEXTRFUNCTSLICESTRIDEOP(const)
-SYCLEXTRFUNCTSLICESTRIDEOP()
-#undef SYCLEXTRFUNCTSLICESTRIDEOP
-
-// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory
-#define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\
-template<typename Param, typename XprType, typename Dev>\
-struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
- const Param m_param;\
- EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
- FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\
-};
-
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const)
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), )
-
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const)
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), )
-#undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT
-
-// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory
-#define PADDINGOPFUNCEXT(OPEXPR, FUNCCALL, SCALARFUNCCALL, CVQual)\
-template<typename Param, typename XprType, typename Dev>\
-struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
- const Param m_param;\
- typedef typename Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>::Scalar Scalar;\
- const Scalar m_scalar_param;\
- EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
- EIGEN_STRONG_INLINE const Scalar& scalar_param() const { return m_scalar_param; }\
- FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_param(expr.FUNCCALL), m_scalar_param(expr.SCALARFUNCCALL) {}\
-};
-
-PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const)
-PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), )
-#undef PADDINGOPFUNCEXT
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is TensorContractionOp and TensorConcatenationOp
-/// for TensorContractionOp the LHS and RHS here are the original one no need to apply condition on their type.
-#define SYCLEXTRFUNCCONTRACTCONCAT(OPEXPR, FUNCCALL, CVQual)\
-template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const Param func;\
- FunctorExtractor(const TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
-};
-
-// TensorConcatenationOp
-SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
-SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
-#undef SYCLEXTRFUNCCONTRACTCONCAT
-
-//TensorChippingOp
-#define SYCLEXTRFUNCCHIPPINGOP(CVQual)\
-template<DenseIndex DimId, typename XprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>>{\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
- const DenseIndex m_dim;\
- const DenseIndex m_offset;\
- EIGEN_STRONG_INLINE const DenseIndex& dimId() const { return m_dim; }\
- EIGEN_STRONG_INLINE const DenseIndex& offset() const { return m_offset; }\
- FunctorExtractor(const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>& expr)\
- : xprExpr(expr.impl()), m_dim(expr.dimId()), m_offset(expr.offset()) {}\
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{
+ typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;
+ typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;
+ const Dimensions m_dimensions;
+ const Dimensions& dimensions() const { return m_dimensions; }
+ FunctorExtractor(const TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)
+ : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}
};
-SYCLEXTRFUNCCHIPPINGOP(const)
-SYCLEXTRFUNCCHIPPINGOP()
-#undef SYCLEXTRFUNCCHIPPINGOP
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>
+: FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{};
/// template deduction function for FunctorExtractor
template <typename Evaluator>
auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
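The rewrite above keeps the FunctorExtractor idea intact: copy the functor (and, for reductions, the output dimensions) out of a host-side TensorEvaluator into a plain struct that can safely be captured by value inside a SYCL kernel. A simplified sketch of that idea with assumed names rather than Eigen's:

#include <iostream>

// Host-side stand-ins: an "evaluator" that owns a functor, and an extractor
// that copies only the device-copyable state (the functor itself) so the
// result can be passed by value to a kernel. All names here are illustrative.
struct SquareOp { int operator()(int x) const { return x * x; } };

template <typename Op> struct EvaluatorSketch {
  Op m_op;
  const Op& functor() const { return m_op; }
};

template <typename Op> struct FunctorExtractorSketch {
  Op func;  // a plain copy, no references back into host memory
  explicit FunctorExtractorSketch(const EvaluatorSketch<Op>& e) : func(e.functor()) {}
};

template <typename Op>
FunctorExtractorSketch<Op> extractFunctorsSketch(const EvaluatorSketch<Op>& e) {
  return FunctorExtractorSketch<Op>(e);
}

int main() {
  EvaluatorSketch<SquareOp> eval{SquareOp{}};
  FunctorExtractorSketch<SquareOp> extracted = extractFunctorsSketch(eval);
  std::cout << extracted.func(7) << '\n';  // prints 49
}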
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
deleted file mode 100644
index 2f77790..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ /dev/null
@@ -1,245 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: eigen@codeplay.com
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// General include header of SYCL target for Tensor Module
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
- template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
- OP op;
- OutputAccessor aOut;
- InputAccessor aI;
- LocalAccessor scratch;
- size_t length, local;
- GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
- : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- size_t globalid = itemID.get_global(0);
- size_t localid = itemID.get_local(0);
- /* All threads collectively read from global memory into local.
- * The barrier ensures all threads' IO is resolved before
- * execution continues (strictly speaking, all threads within
- * a single work-group - there is no co-ordination between
- * work-groups, only work-items). */
- if (globalid < length) {
- scratch[localid] = aI[globalid];
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- /* Apply the reduction operation between the current local
- * id and the one on the other half of the vector. */
- if (globalid < length) {
- auto min = (length < local) ? length : local;
- for (size_t offset = min / 2; offset > 0; offset /= 2) {
- if (localid < offset) {
- auto accum = op.initialize();
- op.reduce(scratch[localid], &accum);
- op.reduce(scratch[localid + offset], &accum);
- op.finalize(accum);
- scratch[localid]=accum;
- //scratch[localid] += scratch[localid + offset];
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- }
- /* The final result will be stored in local id 0. */
- if (localid == 0) {
- aI[itemID.get_group(0)] = scratch[localid];
- if((length<=local) && globalid ==0){
- auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
- aOutPtr[0]=scratch[0];
- }
- }
- }
- }
-
- };
-
-/// ReductionFunctor
-template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
- public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
- ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
- :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=static_cast<Index>(itemID.get_global_linear_id());
- if (globalid< range) {
- typename DeviceSelf::CoeffReturnType accum = functor.initialize();
- Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
- functor.finalize(accum);
- output_accessor_ptr[globalid]= accum;
- }
- }
- private:
- write_accessor output_accessor;
- FunctorExpr functors;
- Tuple_of_Acc tuple_of_accessors;
- Dims dims;
- Op functor;
- Index range;
-};
-
-template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index>
-class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
- public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
- typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
- ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
- Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
- :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=static_cast<Index>(itemID.get_global_linear_id());
- if (globalid< range) {
- typename DeviceSelf::CoeffReturnType accum = functor.initialize();
- Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
- functor.finalize(accum);
- output_accessor_ptr[globalid]= accum/num_values_to_reduce;
- }
- }
- private:
- write_accessor output_accessor;
- FunctorExpr functors;
- Tuple_of_Acc tuple_of_accessors;
- Dims dims;
- Op functor;
- Index range;
- Index num_values_to_reduce;
-};
-
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor{
-public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- OutAccessor tmp_global_accessor;
- Index rng , remaining, red_factor;
- Op op;
- Dims dims;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
-
- FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc)
- :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=itemID.get_global_linear_id();
-
- tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op))
- : static_cast<CoeffReturnType>(op.initialize());
-
- if(remaining!=0 && globalid==0 ){
- // this will add the rest of input buffer when the input size is not devidable to red_factor.
- auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
- reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
- auto accum = op.initialize();
- op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
- op.reduce(remaining_reduce, &accum);
- op.finalize(accum);
- tmp_global_accessor.get_pointer()[0]=accum;
-
- }
- }
-};
-
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{
-public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
-
- OutAccessor tmp_global_accessor;
- Index rng , remaining, red_factor;
- Op op;
- Dims dims;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
-
- FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc)
- :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=itemID.get_global_linear_id();
- auto scale = (rng*red_factor) + remaining;
-
- tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? ((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale)
- :static_cast<CoeffReturnType>(op.initialize())/scale;
-
- if(remaining!=0 && globalid==0 ){
- // this will add the rest of input buffer when the input size is not devidable to red_factor.
- auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
- auto accum = op.initialize();
- tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale;
- op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
- op.reduce(remaining_reduce, &accum);
- op.finalize(accum);
- tmp_global_accessor.get_pointer()[0]=accum/scale;
-
- }
- }
-};
-
-}
-}
-}
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
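For reference, the deleted GenericKernelReducer above implemented a standard work-group tree reduction: every pass folds the upper half of the local scratch buffer onto the lower half, with a barrier between passes, until element 0 holds the group's partial result. A serial sketch of that halving loop (plain C++, no SYCL, power-of-two length assumed for brevity):

#include <cstddef>
#include <iostream>
#include <vector>

// Serial illustration of the halving tree reduction performed by the deleted
// kernel; on the device each inner loop iteration runs on a separate work-item
// and a work-group barrier separates the passes.
int tree_reduce(std::vector<int> scratch) {
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
    for (std::size_t i = 0; i < offset; ++i) {
      scratch[i] += scratch[i + offset];  // fold upper half onto lower half
    }
  }
  return scratch[0];
}

int main() {
  std::cout << tree_reduce({1, 2, 3, 4, 5, 6, 7, 8}) << '\n';  // prints 36
}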
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
index a1c112f..25d1fac 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -44,120 +44,68 @@ struct CategoryCount<Arg,Args...>{
};
/// specialisation of the \ref LeafCount struct when the node type is const TensorMap
-#define SYCLTENSORMAPLEAFCOUNT(CVQual)\
-template <typename PlainObjectType, int Options_, template <class> class MakePointer_>\
-struct LeafCount<CVQual TensorMap<PlainObjectType, Options_, MakePointer_> > {\
- static const size_t Count =1;\
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> > {
+ static const size_t Count =1;
};
-SYCLTENSORMAPLEAFCOUNT(const)
-SYCLTENSORMAPLEAFCOUNT()
-#undef SYCLTENSORMAPLEAFCOUNT
+/// specialisation of the \ref LeafCount struct when the node type is TensorMap
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct LeafCount<TensorMap<PlainObjectType, Options_, MakePointer_> > :LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> >{};
-// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp
-#define SYCLCATEGORYLEAFCOUNT(CVQual)\
-template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>\
-struct LeafCount<CVQual CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
-
-SYCLCATEGORYLEAFCOUNT(const)
-SYCLCATEGORYLEAFCOUNT()
-#undef SYCLCATEGORYLEAFCOUNT
+// const TensorCwiseUnaryOp, const TensorCwiseNullaryOp, const TensorCwiseBinaryOp, const TensorCwiseTernaryOp, and const TensorBroadcastingOp
+template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
+struct LeafCount<const CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
+// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp
+template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
+struct LeafCount<CategoryExpr<OP, RHSExpr...> > :LeafCount<const CategoryExpr<OP, RHSExpr...> >{};
 /// specialisation of the \ref LeafCount struct when the node type is const TensorSelectOp, which is an exception
-#define SYCLSELECTOPLEAFCOUNT(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
-struct LeafCount<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
-
-SYCLSELECTOPLEAFCOUNT(const)
-SYCLSELECTOPLEAFCOUNT()
-#undef SYCLSELECTOPLEAFCOUNT
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>
+struct LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
+/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>
+struct LeafCount<TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >: LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > {};
-/// specialisation of the \ref LeafCount struct when the node type is TensorAssignOp
-#define SYCLLEAFCOUNTASSIGNOP(CVQual)\
-template <typename LHSExpr, typename RHSExpr>\
-struct LeafCount<CVQual TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
+/// specialisation of the \ref LeafCount struct when the node type is const TensorAssignOp
+template <typename LHSExpr, typename RHSExpr>
+struct LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
-SYCLLEAFCOUNTASSIGNOP(const)
-SYCLLEAFCOUNTASSIGNOP()
-#undef SYCLLEAFCOUNTASSIGNOP
+/// specialisation of the \ref LeafCount struct when the node type is
+/// TensorAssignOp. This is an exception; it is not the same as the unary case.
+template <typename LHSExpr, typename RHSExpr>
+struct LeafCount<TensorAssignOp<LHSExpr, RHSExpr> > :LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >{};
/// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp
-#define SYCLFORCEDEVALLEAFCOUNT(CVQual)\
-template <typename Expr>\
-struct LeafCount<CVQual TensorForcedEvalOp<Expr> > {\
- static const size_t Count =1;\
+template <typename Expr>
+struct LeafCount<const TensorForcedEvalOp<Expr> > {
+ static const size_t Count =1;
};
-SYCLFORCEDEVALLEAFCOUNT(const)
-SYCLFORCEDEVALLEAFCOUNT()
-#undef SYCLFORCEDEVALLEAFCOUNT
+/// specialisation of the \ref LeafCount struct when the node type is TensorForcedEvalOp
+template <typename Expr>
+struct LeafCount<TensorForcedEvalOp<Expr> >: LeafCount<const TensorForcedEvalOp<Expr> > {};
-/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
-#define EVALTOLEAFCOUNT(CVQual)\
-template <typename Expr>\
-struct LeafCount<CVQual TensorEvalToOp<Expr> > {\
- static const size_t Count = 1 + CategoryCount<Expr>::Count;\
+/// specialisation of the \ref LeafCount struct when the node type is const TensorEvalToOp
+template <typename Expr>
+struct LeafCount<const TensorEvalToOp<Expr> > {
+ static const size_t Count = 1 + CategoryCount<Expr>::Count;
};
-EVALTOLEAFCOUNT(const)
-EVALTOLEAFCOUNT()
-#undef EVALTOLEAFCOUNT
-
/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
-#define REDUCTIONLEAFCOUNT(CVQual)\
-template <typename OP, typename Dim, typename Expr>\
-struct LeafCount<CVQual TensorReductionOp<OP, Dim, Expr> > {\
- static const size_t Count =1;\
+template <typename OP, typename Dim, typename Expr>
+struct LeafCount<const TensorReductionOp<OP, Dim, Expr> > {
+ static const size_t Count =1;
};
-REDUCTIONLEAFCOUNT(const)
-REDUCTIONLEAFCOUNT()
-#undef REDUCTIONLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp
-#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\
-template <typename Indices, typename LhsXprType, typename RhsXprType>\
-struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\
- static const size_t Count =1;\
-};
-
-CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
-#undef CONTRACTIONCONVOLUTIONLEAFCOUNT
-
-
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
-#define SLICEOPLEAFCOUNT(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType>\
-struct LeafCount<CVQual TensorSlicingOp<StartIndices, Sizes, XprType> >:CategoryCount<XprType>{};
-
-SLICEOPLEAFCOUNT(const)
-SLICEOPLEAFCOUNT()
-#undef SLICEOPLEAFCOUNT
-
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp
-#define CHIPPINGOPLEAFCOUNT(CVQual)\
-template <DenseIndex DimId, typename XprType>\
-struct LeafCount<CVQual TensorChippingOp<DimId, XprType> >:CategoryCount<XprType>{};
-
-CHIPPINGOPLEAFCOUNT(const)
-CHIPPINGOPLEAFCOUNT()
-#undef CHIPPINGOPLEAFCOUNT
-
-
-#define SLICESTRIDEOPLEAFCOUNT(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
-struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{};
-
-SLICESTRIDEOPLEAFCOUNT(const)
-SLICESTRIDEOPLEAFCOUNT()
-#undef SLICESTRIDEOPLEAFCOUNT
+/// specialisation of the \ref LeafCount struct when the node type is TensorReductionOp
+template <typename OP, typename Dim, typename Expr>
+struct LeafCount<TensorReductionOp<OP, Dim, Expr> >: LeafCount<const TensorReductionOp<OP, Dim, Expr> >{};
+/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
+template <typename Expr>
+struct LeafCount<TensorEvalToOp<Expr> >: LeafCount<const TensorEvalToOp<Expr> >{};
} /// namespace TensorSycl
} /// namespace internal
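The LeafCount specialisations above follow one compile-time rule: nodes that materialise a buffer on the device (TensorMap, forced evaluation, reduction) count as a single leaf, while composite nodes sum the counts of their children through CategoryCount (and TensorEvalToOp adds one for its own output). A minimal sketch of that counting pattern with hypothetical names:

#include <cstddef>
#include <iostream>

// Terminal nodes expose Count = 1; composite nodes add up their children.
template <typename... Args> struct CategoryCountSketch;
template <> struct CategoryCountSketch<> { static const std::size_t Count = 0; };
template <typename Arg, typename... Args>
struct CategoryCountSketch<Arg, Args...> {
  static const std::size_t Count = Arg::Count + CategoryCountSketch<Args...>::Count;
};

struct LeafSketch { static const std::size_t Count = 1; };  // plays the role of a TensorMap
template <typename L, typename R>
struct BinaryNodeSketch : CategoryCountSketch<L, R> {};     // plays the role of a cwise binary op

int main() {
  std::cout << BinaryNodeSketch<LeafSketch, LeafSketch>::Count << '\n';  // prints 2
}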
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
index 74566dc..d4c250c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -122,9 +122,9 @@ ASSIGNEXPR()
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorMap
#define TENSORMAPEXPR(CVQual)\
-template <typename T, int Options_, template <class> class MakePointer_, size_t N>\
-struct PlaceHolderExpression< CVQual TensorMap< T, Options_, MakePointer_>, N> {\
- typedef CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N> Type;\
+template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_, size_t N>\
+struct PlaceHolderExpression< CVQual TensorMap< Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> {\
+ typedef CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> Type;\
};
TENSORMAPEXPR(const)
@@ -157,18 +157,6 @@ EVALTO()
/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorChippingOp
-#define CHIPPINGOP(CVQual)\
-template <DenseIndex DimId, typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorChippingOp<DimId, Expr>, N> {\
- typedef CVQual TensorChippingOp< DimId, typename CalculateIndex <N, Expr>::ArgType> Type;\
-};
-
-CHIPPINGOP(const)
-CHIPPINGOP()
-#undef CHIPPINGOP
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorReductionOp
#define SYCLREDUCTION(CVQual)\
template <typename OP, typename Dims, typename Expr, size_t N>\
@@ -179,45 +167,6 @@ SYCLREDUCTION(const)
SYCLREDUCTION()
#undef SYCLREDUCTION
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorReductionOp
-#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
-template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\
-struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\
- typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\
-};
-SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTIONPLH
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorCwiseSelectOp
-#define SLICEOPEXPR(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, N> {\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename CalculateIndex<N, XprType>::ArgType> Type;\
-};
-
-SLICEOPEXPR(const)
-SLICEOPEXPR()
-#undef SLICEOPEXPR
-
-
-#define SYCLSLICESTRIDEOPPLH(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, N> {\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename CalculateIndex<N, XprType>::ArgType> Type;\
-};
-
-SYCLSLICESTRIDEOPPLH(const)
-SYCLSLICESTRIDEOPPLH()
-#undef SYCLSLICESTRIDEOPPLH
-
-
/// template deduction for \ref PlaceHolderExpression struct
template <typename Expr>
struct createPlaceHolderExpression {
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
index cac7855..7914b6f 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -25,70 +25,43 @@
namespace Eigen {
namespace TensorSycl {
-
-template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{
- typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
-
- typedef typename Expr::Index Index;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
- Index range;
- ExecExprFunctorKernel(Index range_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- : functors(functors_), tuple_of_accessors(tuple_of_accessors_), range(range_){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
- auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
- typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
- if (gId < range)
- device_evaluator.evalScalar(gId);
- }
-};
-
/// The run function in tensor sycl convert the expression tree to a buffer
/// based expression tree;
/// creates the expression tree for the device with accessor to buffers;
/// construct the kernel and submit it to the sycl queue.
-/// std::array does not have TotalSize. So I have to get the size through template specialisation.
-template<typename , typename Dimensions> struct DimensionSize{
- static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){
- return dim.TotalSize();
- }
-};
-#define DIMSIZEMACRO(CVQual)\
-template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\
- static inline Index getDimSize(const std::array<Index, NumDims>& dim){\
- return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\
- }\
-};
-
-DIMSIZEMACRO(const)
-DIMSIZEMACRO()
-#undef DIMSIZEMACRO
-
-
template <typename Expr, typename Dev>
void run(Expr &expr, Dev &dev) {
Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
- typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr;
- FunctorExpr functors = internal::extractFunctors(evaluator);
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // create a tuple of accessors from Evaluator
- typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType;
- TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator);
- typename Expr::Index range, GRange, tileSize;
- typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions()));
- dev.parallel_for_setup(total_size, tileSize, range, GRange);
+ typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
+ auto functors = internal::extractFunctors(evaluator);
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
- ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range
- , functors, tuple_of_accessors
- ));
+ size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+ dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+
+ // create a tuple of accessors from Evaluator
+ auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
+ const auto range = utility::tuple::get<0>(tuple_of_accessors).get_range()[0];
+ size_t GRange=range;
+ if (tileSize>GRange) tileSize=GRange;
+ else if(GRange>tileSize){
+ size_t xMode = GRange % tileSize;
+ if (xMode != 0) GRange += (tileSize - xMode);
+ }
+ // run the kernel
+ cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+ typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
+ auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+ if (itemID.get_global_linear_id() < range) {
+ device_evaluator.evalScalar(static_cast<int>(itemID.get_global_linear_id()));
+ }
+ });
});
- dev.asynchronousExec();
+ dev.m_queue.throw_asynchronous();
}
+
evaluator.cleanup();
}
} // namespace TensorSycl
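The rewritten run() above derives its ND-range from the first accessor: tileSize starts at half of the device's maximum work-group size, is clamped to the range, and GRange is then padded up to the next multiple of tileSize so every element is covered by exactly one in-range work-item. The padding arithmetic in isolation, as a hypothetical helper (not part of Eigen):

#include <cstddef>
#include <iostream>

// Round `range` up to the next multiple of `tile`, clamping the tile first;
// this mirrors the GRange/tileSize computation in the run() hunk above.
std::size_t padded_global_range(std::size_t range, std::size_t& tile) {
  if (tile > range) tile = range;  // never use a tile larger than the work
  std::size_t rem = (tile == 0) ? 0 : range % tile;
  return rem == 0 ? range : range + (tile - rem);
}

int main() {
  std::size_t tile = 256;
  std::cout << padded_global_range(1000, tile) << '\n';  // prints 1024; tile stays 256
}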
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
index 58ab0f0..063b027 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
@@ -20,7 +20,6 @@
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
-
namespace utility {
namespace tuple {
/// \struct StaticIf
@@ -232,5 +231,4 @@ Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
}
} // tuple
} // utility
-
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index a1e944e..ffcf8b0 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -58,8 +58,6 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
};
template <typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
-
};
};
@@ -78,8 +76,6 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
};
template <typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
-
};
};
@@ -102,8 +98,6 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
- typedef typename MakePointerT::RefType RefType;
-
};
};
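The TensorTraits hunks above strip the RefType member, leaving MakePointer as a pure type-mapping trait from T to T*. Its shape in isolation, simplified and with an assumed name:

#include <type_traits>

// Simplified stand-in for the MakePointer trait that remains after this change:
// a nested template mapping a scalar type to the pointer type used for it.
struct TraitsSketch {
  template <typename T> struct MakePointer {
    typedef T* Type;
  };
};

static_assert(std::is_same<TraitsSketch::MakePointer<float>::Type, float*>::value,
              "MakePointer maps T to T*");

int main() { return 0; }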
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index d23f2e4..3523e7c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -23,7 +23,6 @@ struct static_val {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
- EIGEN_UNUSED_VARIABLE(v);
eigen_assert(v == n);
}
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 9dcc9da..354bce5 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -20,13 +20,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
typedef RunQueue<Task, 1024> Queue;
NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
- : NonBlockingThreadPoolTempl(num_threads, true, env) {}
-
- NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning,
- Environment env = Environment())
- : num_threads_(num_threads),
- allow_spinning_(allow_spinning),
- env_(env),
+ : env_(env),
threads_(num_threads),
queues_(num_threads),
coprimes_(num_threads),
@@ -34,20 +28,19 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
blocked_(0),
spinning_(0),
done_(false),
- cancelled_(false),
ec_(waiters_) {
- waiters_.resize(num_threads_);
+ waiters_.resize(num_threads);
- // Calculate coprimes of num_threads_.
+ // Calculate coprimes of num_threads.
// Coprimes are used for a random walk over all threads in Steal
// and NonEmptyQueueIndex. Iteration is based on the fact that if we take
// a walk starting thread index t and calculate num_threads - 1 subsequent
// indices as (t + coprime) % num_threads, we will cover all threads without
 // repetitions (effectively getting a pseudo-random permutation of thread
// indices).
- for (int i = 1; i <= num_threads_; i++) {
+ for (int i = 1; i <= num_threads; i++) {
unsigned a = i;
- unsigned b = num_threads_;
+ unsigned b = num_threads;
// If GCD(a, b) == 1, then a and b are coprimes.
while (b != 0) {
unsigned tmp = a;
@@ -58,33 +51,24 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
coprimes_.push_back(i);
}
}
- for (int i = 0; i < num_threads_; i++) {
+ for (int i = 0; i < num_threads; i++) {
queues_.push_back(new Queue());
}
- for (int i = 0; i < num_threads_; i++) {
+ for (int i = 0; i < num_threads; i++) {
threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
}
}
~NonBlockingThreadPoolTempl() {
done_ = true;
-
// Now if all threads block without work, they will start exiting.
// But note that threads can continue to work arbitrary long,
// block, submit new work, unblock and otherwise live full life.
- if (!cancelled_) {
- ec_.Notify(true);
- } else {
- // Since we were cancelled, there might be entries in the queues.
- // Empty them to prevent their destructor from asserting.
- for (size_t i = 0; i < queues_.size(); i++) {
- queues_[i]->Flush();
- }
- }
+ ec_.Notify(true);
// Join threads explicitly to avoid destruction order issues.
- for (size_t i = 0; i < num_threads_; i++) delete threads_[i];
- for (size_t i = 0; i < num_threads_; i++) delete queues_[i];
+ for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
+ for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
}
void Schedule(std::function<void()> fn) {
@@ -107,31 +91,14 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
// completes overall computations, which in turn leads to destruction of
// this. We expect that such scenario is prevented by program, that is,
// this is kept alive while any threads can potentially be in Schedule.
- if (!t.f) {
+ if (!t.f)
ec_.Notify(false);
- }
- else {
+ else
env_.ExecuteTask(t); // Push failed, execute directly.
- }
- }
-
- void Cancel() {
- cancelled_ = true;
- done_ = true;
-
- // Let each thread know it's been cancelled.
-#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
- for (size_t i = 0; i < threads_.size(); i++) {
- threads_[i]->OnCancel();
- }
-#endif
-
- // Wake up the threads without work to let them exit on their own.
- ec_.Notify(true);
}
int NumThreads() const final {
- return num_threads_;
+ return static_cast<int>(threads_.size());
}
int CurrentThreadId() const final {
@@ -155,8 +122,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
};
Environment env_;
- const int num_threads_;
- const bool allow_spinning_;
MaxSizeVector<Thread*> threads_;
MaxSizeVector<Queue*> queues_;
MaxSizeVector<unsigned> coprimes_;
@@ -164,7 +129,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
std::atomic<unsigned> blocked_;
std::atomic<bool> spinning_;
std::atomic<bool> done_;
- std::atomic<bool> cancelled_;
EventCount ec_;
// Main worker thread loop.
@@ -175,62 +139,32 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
pt->thread_id = thread_id;
Queue* q = queues_[thread_id];
EventCount::Waiter* waiter = &waiters_[thread_id];
- // TODO(dvyukov,rmlarsen): The time spent in Steal() is proportional
- // to num_threads_ and we assume that new work is scheduled at a
- // constant rate, so we set spin_count to 5000 / num_threads_. The
- // constant was picked based on a fair dice roll, tune it.
- const int spin_count =
- allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
- if (num_threads_ == 1) {
- // For num_threads_ == 1 there is no point in going through the expensive
- // steal loop. Moreover, since Steal() calls PopBack() on the victim
- // queues it might reverse the order in which ops are executed compared to
- // the order in which they are scheduled, which tends to be
- // counter-productive for the types of I/O workloads the single thread
- // pools tend to be used for.
- while (!cancelled_) {
- Task t = q->PopFront();
- for (int i = 0; i < spin_count && !t.f; i++) {
- if (!cancelled_.load(std::memory_order_relaxed)) {
- t = q->PopFront();
- }
- }
+ for (;;) {
+ Task t = q->PopFront();
+ if (!t.f) {
+ t = Steal();
if (!t.f) {
- if (!WaitForWork(waiter, &t)) {
- return;
+ // Leave one thread spinning. This reduces latency.
+ // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
+ // Also, the time it takes to attempt to steal work 1000 times depends
+ // on the size of the thread pool. However, the speed at which the user
+ // of the thread pool submits tasks is independent of the size of the
+ // pool. Consider a time based limit instead.
+ if (!spinning_ && !spinning_.exchange(true)) {
+ for (int i = 0; i < 1000 && !t.f; i++) {
+ t = Steal();
+ }
+ spinning_ = false;
}
- }
- if (t.f) {
- env_.ExecuteTask(t);
- }
- }
- } else {
- while (!cancelled_) {
- Task t = q->PopFront();
- if (!t.f) {
- t = Steal();
if (!t.f) {
- // Leave one thread spinning. This reduces latency.
- if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
- for (int i = 0; i < spin_count && !t.f; i++) {
- if (!cancelled_.load(std::memory_order_relaxed)) {
- t = Steal();
- } else {
- return;
- }
- }
- spinning_ = false;
- }
- if (!t.f) {
- if (!WaitForWork(waiter, &t)) {
- return;
- }
+ if (!WaitForWork(waiter, &t)) {
+ return;
}
}
}
- if (t.f) {
- env_.ExecuteTask(t);
- }
+ }
+ if (t.f) {
+ env_.ExecuteTask(t);
}
}
}
@@ -267,18 +201,14 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
int victim = NonEmptyQueueIndex();
if (victim != -1) {
ec_.CancelWait(waiter);
- if (cancelled_) {
- return false;
- } else {
- *t = queues_[victim]->PopBack();
- return true;
- }
+ *t = queues_[victim]->PopBack();
+ return true;
}
// Number of blocked threads is used as termination condition.
// If we are shutting down and all worker threads blocked without work,
// that means we are done.
blocked_++;
- if (done_ && blocked_ == num_threads_) {
+ if (done_ && blocked_ == threads_.size()) {
ec_.CancelWait(waiter);
// Almost done, but need to re-check queues.
// Consider that all queues are empty and all worker threads are preempted
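
A side note on the constructor hunks above: striding the queue index by a value that is coprime with the number of threads visits every queue exactly once before repeating, which is what Steal() and NonEmptyQueueIndex() rely on. A minimal standalone sketch of that idea (gcd and compute_coprimes are illustrative names, not Eigen internals):

    #include <vector>

    // Euclid's algorithm, mirroring the while loop in the constructor above.
    static unsigned gcd(unsigned a, unsigned b) {
      while (b != 0) { unsigned tmp = a; a = b; b = tmp % b; }
      return a;
    }

    // Collect every stride in [1, num_threads] that is coprime with num_threads.
    static std::vector<unsigned> compute_coprimes(unsigned num_threads) {
      std::vector<unsigned> coprimes;
      for (unsigned i = 1; i <= num_threads; ++i)
        if (gcd(i, num_threads) == 1) coprimes.push_back(i);
      return coprimes;
    }

    // Because the stride is coprime with num_threads, the walk
    // (t + k * stride) % num_threads for k = 0 .. num_threads - 1 visits each
    // victim index exactly once, i.e. a pseudo-random permutation of threads.
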
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 49d0cdc..05ed76c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -177,13 +177,6 @@ class RunQueue {
// Can be called by any thread at any time.
bool Empty() const { return Size() == 0; }
- // Delete all the elements from the queue.
- void Flush() {
- while (!Empty()) {
- PopFront();
- }
- }
-
private:
static const unsigned kMask = kSize - 1;
static const unsigned kMask2 = (kSize << 1) - 1;
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
index 3357286..e75d0f4 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -69,14 +69,6 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
}
}
- void Cancel() {
-#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
- for (size_t i = 0; i < threads_.size(); i++) {
- threads_[i]->OnCancel();
- }
-#endif
- }
-
int NumThreads() const final {
return static_cast<int>(threads_.size());
}
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
deleted file mode 100644
index a05685f..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
+++ /dev/null
@@ -1,23 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
-#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
-
-// Try to come up with a portable way to cancel a thread
-#if EIGEN_OS_GNULINUX
- #define EIGEN_THREAD_CANCEL(t) \
- pthread_cancel(t.native_handle());
- #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
-#else
-#define EIGEN_THREAD_CANCEL(t)
-#endif
-
-
-#endif // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
index d94a064..399f95c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -23,8 +23,6 @@ struct StlThreadEnvironment {
public:
EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
~EnvThread() { thr_.join(); }
- // This function is called when the threadpool is cancelled.
- void OnCancel() { }
private:
std::thread thr_;
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
index 84e1e6c..a65ee97 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -16,14 +16,8 @@ namespace Eigen {
// custom thread pools underneath.
class ThreadPoolInterface {
public:
- // Submits a closure to be run by a thread in the pool.
virtual void Schedule(std::function<void()> fn) = 0;
- // If implemented, stop processing the closures that have been enqueued.
- // Currently running closures may still be processed.
- // If not implemented, does nothing.
- virtual void Cancel() {}
-
// Returns the number of threads in the pool.
virtual int NumThreads() const = 0;
diff --git a/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index 49d315a..ec27edd 100644
--- a/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -40,7 +40,7 @@ template<typename T, T... nn>
struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
template<typename T, T n, T... nn>
-struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; };
+struct numeric_list<T, n, nn...> { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; };
/* numeric list constructors
*
@@ -123,10 +123,6 @@ template<typename a, typename... as> struct get<0, type_lis
template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {};
template<typename T, T a, T... as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; };
-template<std::size_t n, typename T, T a, T... as> constexpr T array_get(const numeric_list<T, a, as...>&) {
- return get<(int)n, numeric_list<T, a, as...>>::value;
-}
-
/* always get type, regardless of dummy; good for parameter pack expansion */
template<typename T, T dummy, typename t> struct id_numeric { typedef t type; };
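
A small standalone illustration of the numeric_list change in the hunk above (numeric_list_demo is a stand-in name, not Eigen's type): with constexpr static members, the element count and first value are compile-time constants attached to the type and can be checked directly in a static_assert.

    #include <cstddef>

    template<typename T, T... nn>
    struct numeric_list_demo { constexpr static std::size_t count = sizeof...(nn); };

    template<typename T, T n, T... nn>
    struct numeric_list_demo<T, n, nn...> {
      constexpr static std::size_t count = sizeof...(nn) + 1;
      constexpr static T first_value = n;
    };

    static_assert(numeric_list_demo<int, 4, 8, 15>::count == 3, "three elements");
    static_assert(numeric_list_demo<int, 4, 8, 15>::first_value == 4, "first element");
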
diff --git a/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 573ca43..30d3ebc 100644
--- a/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -169,7 +169,6 @@ template <typename T> class array<T, 0> {
#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
- EIGEN_UNUSED_VARIABLE(l);
eigen_assert(l.size() == 0);
}
#endif
@@ -201,15 +200,19 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
return a[I];
}
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<array<T,N> > {
static const size_t value = N;
};
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<array<T,N>& > {
static const size_t value = N;
};
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<const array<T,N> > {
static const size_t value = N;
};
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<const array<T,N>& > {
static const size_t value = N;
};
@@ -248,6 +251,14 @@ template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_
#undef STD_GET_ARR_HACK
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
+ static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<std::array<T,N> > {
+ static const size_t value = N;
+};
} // end namespace internal
} // end namespace Eigen
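
The array_size specializations added above all follow the same trait idiom: declare the primary template without a definition and let each specialization expose the element count. A standalone sketch using only the standard library (array_size_demo is an illustrative name):

    #include <array>
    #include <cstddef>

    // Primary template is only declared; specializations provide ::value.
    template <typename T> struct array_size_demo;

    template <class T, std::size_t N>
    struct array_size_demo<std::array<T, N> > {
      static const std::size_t value = N;
    };

    // The element count is recovered purely at compile time.
    static_assert(array_size_demo<std::array<int, 3> >::value == 3,
                  "N is visible as a compile-time constant");
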
diff --git a/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index d280886..279fe5c 100644
--- a/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -683,4 +683,11 @@ template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
}
+namespace std {
+template <typename T>
+class numeric_limits<Eigen::AutoDiffScalar<T> >
+ : public numeric_limits<typename T::Scalar> {};
+
+} // namespace std
+
#endif // EIGEN_AUTODIFF_SCALAR_H
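
The std::numeric_limits specialization added above forwards to the traits of the underlying scalar, so generic code that inspects numeric_limits keeps working for AutoDiffScalar. A hedged usage sketch (assumes Eigen and its AutoDiff module are on the include path; ADouble is an illustrative alias):

    #include <limits>
    #include <Eigen/Core>
    #include <unsupported/Eigen/AutoDiff>

    typedef Eigen::AutoDiffScalar<Eigen::VectorXd> ADouble;

    // Both queries below resolve to the traits of double, the scalar type
    // underlying ADouble's derivative vector.
    static_assert(std::numeric_limits<ADouble>::is_specialized,
                  "inherited from numeric_limits<double>");
    const int kAutoDiffDigits = std::numeric_limits<ADouble>::digits;  // 53
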
diff --git a/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h
index a5d034d..13a0da1 100644
--- a/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h
+++ b/eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h
@@ -12,6 +12,11 @@
namespace Eigen
{
+ /*template<typename Other,
+ int OtherRows=Other::RowsAtCompileTime,
+ int OtherCols=Other::ColsAtCompileTime>
+ struct ei_eulerangles_assign_impl;*/
+
/** \class EulerAngles
*
* \ingroup EulerAngles_Module
@@ -31,7 +36,7 @@ namespace Eigen
* ### Rotation representation and conversions ###
*
* It has been proved(see Wikipedia link below) that every rotation can be represented
- * by Euler angles, but there is no single representation (e.g. unlike rotation matrices).
+ * by Euler angles, but there is no singular representation (e.g. unlike rotation matrices).
* Therefore, you can convert from Eigen rotation and to them
* (including rotation matrices, which is not called "rotations" by Eigen design).
*
@@ -50,27 +55,33 @@ namespace Eigen
* Additionally, some axes related computation is done in compile time.
*
* #### Euler angles ranges in conversions ####
- * Rotations representation as EulerAngles are not single (unlike matrices),
- * and even have infinite EulerAngles representations.<BR>
- * For example, add or subtract 2*PI from either angle of EulerAngles
- * and you'll get the same rotation.
- * This is the general reason for infinite representation,
- * but it's not the only general reason for not having a single representation.
*
- * When converting rotation to EulerAngles, this class convert it to specific ranges
- * When converting some rotation to EulerAngles, the rules for ranges are as follow:
- * - If the rotation we converting from is an EulerAngles
- * (even when it represented as RotationBase explicitly), angles ranges are __undefined__.
- * - otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
- * As for Beta angle:
- * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
- * - otherwise:
- * - If the beta axis is positive, the beta angle will be in the range [0, PI]
- * - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+ * When converting some rotation to Euler angles, there are some ways you can guarantee
+ * the Euler angles ranges.
*
+ * #### implicit ranges ####
+ * When using implicit ranges, all angles are guaranteed to be in the range [-PI, +PI],
+ * unless you convert from some other Euler angles.
+ * In this case, the range is __undefined__ (might be even less than -PI or greater than +2*PI).
* \sa EulerAngles(const MatrixBase<Derived>&)
* \sa EulerAngles(const RotationBase<Derived, 3>&)
*
+ * #### explicit ranges ####
+ * When using explicit ranges, all angles are guaranteed to be in the range you choose.
+ * In the range Boolean parameter, you are asked whether you prefer the positive range or not:
+ * - _true_ - force the range between [0, +2*PI]
+ * - _false_ - force the range between [-PI, +PI]
+ *
+ * ##### compile time ranges #####
+ * This is when you have compile time ranges and you prefer to
+ * use template parameter. (e.g. for performance)
+ * \sa FromRotation()
+ *
+ * ##### run-time time ranges #####
+ * Run-time ranges are also supported.
+ * \sa EulerAngles(const MatrixBase<Derived>&, bool, bool, bool)
+ * \sa EulerAngles(const RotationBase<Derived, 3>&, bool, bool, bool)
+ *
* ### Convenient user typedefs ###
*
* Convenient typedefs for EulerAngles exist for float and double scalar,
@@ -92,7 +103,7 @@ namespace Eigen
*
* More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
*
- * \tparam _Scalar the scalar type, i.e. the type of the angles.
+ * \tparam _Scalar the scalar type, i.e., the type of the angles.
*
* \tparam _System the EulerSystem to use, which represents the axes of rotation.
*/
@@ -100,11 +111,8 @@ namespace Eigen
class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3>
{
public:
- typedef RotationBase<EulerAngles<_Scalar, _System>, 3> Base;
-
/** the scalar type of the angles */
typedef _Scalar Scalar;
- typedef typename NumTraits<Scalar>::Real RealScalar;
/** the EulerSystem to use, which represents the axes of rotation. */
typedef _System System;
@@ -138,56 +146,67 @@ namespace Eigen
public:
/** Default constructor without initialization. */
EulerAngles() {}
- /** Constructs and initialize an EulerAngles (\p alpha, \p beta, \p gamma). */
+ /** Constructs and initialize Euler angles(\p alpha, \p beta, \p gamma). */
EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) :
m_angles(alpha, beta, gamma) {}
- // TODO: Test this constructor
- /** Constructs and initialize an EulerAngles from the array data {alpha, beta, gamma} */
- explicit EulerAngles(const Scalar* data) : m_angles(data) {}
+ /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m.
+ *
+ * \note All angles will be in the range [-PI, PI].
+ */
+ template<typename Derived>
+ EulerAngles(const MatrixBase<Derived>& m) { *this = m; }
- /** Constructs and initializes an EulerAngles from either:
- * - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
- * - a 3D vector expression representing Euler angles.
+ /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m,
+ * with options to choose for each angle the requested range.
+ *
+ * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+ * Otherwise, the specified angle will be in the range [-PI, +PI].
*
- * \note If \p other is a 3x3 rotation matrix, the angles range rules will be as follow:<BR>
- * Alpha and gamma angles will be in the range [-PI, PI].<BR>
- * As for Beta angle:
- * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
- * - otherwise:
- * - If the beta axis is positive, the beta angle will be in the range [0, PI]
- * - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
- */
+ * \param m The 3x3 rotation matrix to convert
+ * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ */
template<typename Derived>
- explicit EulerAngles(const MatrixBase<Derived>& other) { *this = other; }
+ EulerAngles(
+ const MatrixBase<Derived>& m,
+ bool positiveRangeAlpha,
+ bool positiveRangeBeta,
+ bool positiveRangeGamma) {
+
+ System::CalcEulerAngles(*this, m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+ }
/** Constructs and initialize Euler angles from a rotation \p rot.
*
- * \note If \p rot is an EulerAngles (even when it represented as RotationBase explicitly),
- * angles ranges are __undefined__.
- * Otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
- * As for Beta angle:
- * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
- * - otherwise:
- * - If the beta axis is positive, the beta angle will be in the range [0, PI]
- * - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+ * \note All angles will be in the range [-PI, PI], unless \p rot is an EulerAngles.
+ * If rot is an EulerAngles, expected EulerAngles range is __undefined__.
+ * (Use other functions here for enforcing range if this effect is desired)
*/
template<typename Derived>
- EulerAngles(const RotationBase<Derived, 3>& rot) { System::CalcEulerAngles(*this, rot.toRotationMatrix()); }
+ EulerAngles(const RotationBase<Derived, 3>& rot) { *this = rot; }
- /*EulerAngles(const QuaternionType& q)
- {
- // TODO: Implement it in a faster way for quaternions
- // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
- // we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
- // Currently we compute all matrix cells from quaternion.
-
- // Special case only for ZYX
- //Scalar y2 = q.y() * q.y();
- //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
- //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
- //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
- }*/
+ /** Constructs and initialize Euler angles from a rotation \p rot,
+ * with options to choose for each angle the requested range.
+ *
+ * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+ * Otherwise, the specified angle will be in the range [-PI, +PI].
+ *
+ * \param rot The 3x3 rotation matrix to convert
+ * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ */
+ template<typename Derived>
+ EulerAngles(
+ const RotationBase<Derived, 3>& rot,
+ bool positiveRangeAlpha,
+ bool positiveRangeBeta,
+ bool positiveRangeGamma) {
+
+ System::CalcEulerAngles(*this, rot.toRotationMatrix(), positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+ }
/** \returns The angle values stored in a vector (alpha, beta, gamma). */
const Vector3& angles() const { return m_angles; }
@@ -227,48 +246,90 @@ namespace Eigen
return inverse();
}
- /** Set \c *this from either:
- * - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
- * - a 3D vector expression representing Euler angles.
+ /** Constructs and initialize Euler angles from a 3x3 rotation matrix \p m,
+ * with options to choose for each angle the requested range (__only in compile time__).
*
- * See EulerAngles(const MatrixBase<Derived, 3>&) for more information about
- * angles ranges output.
+ * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+ * Otherwise, the specified angle will be in the range [-PI, +PI].
+ *
+ * \param m The 3x3 rotation matrix to convert
+ * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ */
+ template<
+ bool PositiveRangeAlpha,
+ bool PositiveRangeBeta,
+ bool PositiveRangeGamma,
+ typename Derived>
+ static EulerAngles FromRotation(const MatrixBase<Derived>& m)
+ {
+ EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+
+ EulerAngles e;
+ System::template CalcEulerAngles<
+ PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma, _Scalar>(e, m);
+ return e;
+ }
+
+ /** Constructs and initialize Euler angles from a rotation \p rot,
+ * with options to choose for each angle the requested range (__only in compile time__).
+ *
+ * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+ * Otherwise, the specified angle will be in the range [-PI, +PI].
+ *
+ * \param rot The 3x3 rotation matrix to convert
+ * \tparam positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \tparam positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+ * \tparam positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
*/
- template<class Derived>
- EulerAngles& operator=(const MatrixBase<Derived>& other)
+ template<
+ bool PositiveRangeAlpha,
+ bool PositiveRangeBeta,
+ bool PositiveRangeGamma,
+ typename Derived>
+ static EulerAngles FromRotation(const RotationBase<Derived, 3>& rot)
+ {
+ return FromRotation<PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma>(rot.toRotationMatrix());
+ }
+
+ /*EulerAngles& fromQuaternion(const QuaternionType& q)
{
- EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename Derived::Scalar>::value),
- YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+ // TODO: Implement it in a faster way for quaternions
+ // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+ // we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
+ // Currently we compute all matrix cells from quaternion.
+
+ // Special case only for ZYX
+ //Scalar y2 = q.y() * q.y();
+ //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+ //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+ //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+ }*/
+
+ /** Set \c *this from a rotation matrix(i.e. pure orthogonal matrix with determinant of +1). */
+ template<typename Derived>
+ EulerAngles& operator=(const MatrixBase<Derived>& m) {
+ EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
- internal::eulerangles_assign_impl<System, Derived>::run(*this, other.derived());
+ System::CalcEulerAngles(*this, m);
return *this;
}
// TODO: Assign and construct from another EulerAngles (with different system)
- /** Set \c *this from a rotation.
- *
- * See EulerAngles(const RotationBase<Derived, 3>&) for more information about
- * angles ranges output.
- */
+ /** Set \c *this from a rotation. */
template<typename Derived>
EulerAngles& operator=(const RotationBase<Derived, 3>& rot) {
System::CalcEulerAngles(*this, rot.toRotationMatrix());
return *this;
}
- /** \returns \c true if \c *this is approximately equal to \a other, within the precision
- * determined by \a prec.
- *
- * \sa MatrixBase::isApprox() */
- bool isApprox(const EulerAngles& other,
- const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
- { return angles().isApprox(other.angles(), prec); }
+ // TODO: Support isApprox function
/** \returns an equivalent 3x3 rotation matrix. */
Matrix3 toRotationMatrix() const
{
- // TODO: Calc it faster
return static_cast<QuaternionType>(*this).toRotationMatrix();
}
@@ -286,15 +347,6 @@ namespace Eigen
s << eulerAngles.angles().transpose();
return s;
}
-
- /** \returns \c *this with scalar type casted to \a NewScalarType */
- template <typename NewScalarType>
- EulerAngles<NewScalarType, System> cast() const
- {
- EulerAngles<NewScalarType, System> e;
- e.angles() = angles().template cast<NewScalarType>();
- return e;
- }
};
#define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \
@@ -327,29 +379,8 @@ EIGEN_EULER_ANGLES_TYPEDEFS(double, d)
{
typedef _Scalar Scalar;
};
-
- // set from a rotation matrix
- template<class System, class Other>
- struct eulerangles_assign_impl<System,Other,3,3>
- {
- typedef typename Other::Scalar Scalar;
- static void run(EulerAngles<Scalar, System>& e, const Other& m)
- {
- System::CalcEulerAngles(e, m);
- }
- };
-
- // set from a vector of Euler angles
- template<class System, class Other>
- struct eulerangles_assign_impl<System,Other,4,1>
- {
- typedef typename Other::Scalar Scalar;
- static void run(EulerAngles<Scalar, System>& e, const Other& vec)
- {
- e.angles() = vec;
- }
- };
}
+
}
#endif // EIGEN_EULERANGLESCLASS_H
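
A hedged usage sketch of the range options documented in the hunks above (range_selection_demo is an illustrative function name; EulerAnglesZYXd comes from the EIGEN_EULER_ANGLES_TYPEDEFS macros in this header):

    #include <Eigen/Geometry>
    #include <unsupported/Eigen/EulerAngles>
    using namespace Eigen;

    void range_selection_demo() {
      Matrix3d R = (AngleAxisd(0.3,  Vector3d::UnitZ()) *
                    AngleAxisd(-1.2, Vector3d::UnitY()) *
                    AngleAxisd(2.0,  Vector3d::UnitX())).toRotationMatrix();

      // Implicit ranges: every angle lands in [-PI, +PI].
      EulerAnglesZYXd implicitRange(R);

      // Run-time range selection: alpha forced into [0, 2*PI],
      // beta and gamma kept in [-PI, +PI].
      EulerAnglesZYXd runtimeRange(R, true, false, false);

      // Compile-time range selection through FromRotation.
      EulerAnglesZYXd compileTimeRange =
          EulerAnglesZYXd::FromRotation<true, false, false>(R);
    }
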
diff --git a/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h
index 28f52da..98f9f64 100644
--- a/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h
+++ b/eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -18,7 +18,7 @@ namespace Eigen
namespace internal
{
- // TODO: Add this trait to the Eigen internal API?
+ // TODO: Check if already exists on the rest API
template <int Num, bool IsPositive = (Num > 0)>
struct Abs
{
@@ -36,12 +36,6 @@ namespace Eigen
{
enum { value = Axis != 0 && Abs<Axis>::value <= 3 };
};
-
- template<typename System,
- typename Other,
- int OtherRows=Other::RowsAtCompileTime,
- int OtherCols=Other::ColsAtCompileTime>
- struct eulerangles_assign_impl;
}
#define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1]
@@ -75,7 +69,7 @@ namespace Eigen
*
* You can use this class to get two things:
* - Build an Euler system, and then pass it as a template parameter to EulerAngles.
- * - Query some compile time data about an Euler system. (e.g. Whether it's Tait-Bryan)
+ * - Query some compile time data about an Euler system. (e.g. Whether it's tait bryan)
*
* Euler rotation is a set of three rotations about fixed axes. (see \ref EulerAngles)
* This meta-class stores those signed axes as compile-time constants. (see \ref EulerAxis)
@@ -86,7 +80,7 @@ namespace Eigen
* signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported:
* - all axes X, Y, Z in each valid order (see below what order is valid)
* - rotation over the axis is supported both over the positive and negative directions.
- * - both Tait-Bryan and proper/classic Euler angles (i.e. the opposite).
+ * - both tait bryan and proper/classic Euler angles (i.e. the opposite).
*
* Since EulerSystem support both positive and negative directions,
* you may call this rotation distinction in other names:
@@ -96,7 +90,7 @@ namespace Eigen
* Notice that not all axes combinations are valid; an invalid combination would trigger a static assertion.
* Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid.
* This yields two and only two classes:
- * - _Tait-Bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
+ * - _tait bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
* - _proper/classic Euler angles_ - The first and the third unsigned axes is equal,
* and the second is different, e.g. {X,Y,X}
*
@@ -118,9 +112,9 @@ namespace Eigen
*
* \tparam _AlphaAxis the first fixed EulerAxis
*
- * \tparam _BetaAxis the second fixed EulerAxis
+ * \tparam _AlphaAxis the second fixed EulerAxis
*
- * \tparam _GammaAxis the third fixed EulerAxis
+ * \tparam _AlphaAxis the third fixed EulerAxis
*/
template <int _AlphaAxis, int _BetaAxis, int _GammaAxis>
class EulerSystem
@@ -144,16 +138,14 @@ namespace Eigen
BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */
GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */
- IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */
- IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether beta axis is negative */
- IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */
-
- // Parity is even if alpha axis X is followed by beta axis Y, or Y is followed
- // by Z, or Z is followed by X; otherwise it is odd.
- IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */
- IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */
+ IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< weather alpha axis is negative */
+ IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< weather beta axis is negative */
+ IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< weather gamma axis is negative */
+
+ IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< weather the Euler system is odd */
+ IsEven = IsOdd ? 0 : 1, /*!< weather the Euler system is even */
- IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is Tait-Bryan */
+ IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< weather the Euler system is tait bryan */
};
private:
@@ -188,70 +180,71 @@ namespace Eigen
static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/)
{
using std::atan2;
- using std::sqrt;
+ using std::sin;
+ using std::cos;
typedef typename Derived::Scalar Scalar;
-
- const Scalar plusMinus = IsEven? 1 : -1;
- const Scalar minusPlus = IsOdd? 1 : -1;
-
- const Scalar Rsum = sqrt((mat(I,I) * mat(I,I) + mat(I,J) * mat(I,J) + mat(J,K) * mat(J,K) + mat(K,K) * mat(K,K))/2);
- res[1] = atan2(plusMinus * mat(I,K), Rsum);
-
- // There is a singularity when cos(beta) == 0
- if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// cos(beta) != 0
- res[0] = atan2(minusPlus * mat(J, K), mat(K, K));
- res[2] = atan2(minusPlus * mat(I, J), mat(I, I));
- }
- else if(plusMinus * mat(I, K) > 0) {// cos(beta) == 0 and sin(beta) == 1
- Scalar spos = mat(J, I) + plusMinus * mat(K, J); // 2*sin(alpha + plusMinus * gamma
- Scalar cpos = mat(J, J) + minusPlus * mat(K, I); // 2*cos(alpha + plusMinus * gamma)
- Scalar alphaPlusMinusGamma = atan2(spos, cpos);
- res[0] = alphaPlusMinusGamma;
- res[2] = 0;
- }
- else {// cos(beta) == 0 and sin(beta) == -1
- Scalar sneg = plusMinus * (mat(K, J) + minusPlus * mat(J, I)); // 2*sin(alpha + minusPlus*gamma)
- Scalar cneg = mat(J, J) + plusMinus * mat(K, I); // 2*cos(alpha + minusPlus*gamma)
- Scalar alphaMinusPlusBeta = atan2(sneg, cneg);
- res[0] = alphaMinusPlusBeta;
- res[2] = 0;
+ typedef Matrix<Scalar,2,1> Vector2;
+
+ res[0] = atan2(mat(J,K), mat(K,K));
+ Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
+ if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
+ if(res[0] > Scalar(0)) {
+ res[0] -= Scalar(EIGEN_PI);
+ }
+ else {
+ res[0] += Scalar(EIGEN_PI);
+ }
+ res[1] = atan2(-mat(I,K), -c2);
}
+ else
+ res[1] = atan2(-mat(I,K), c2);
+ Scalar s1 = sin(res[0]);
+ Scalar c1 = cos(res[0]);
+ res[2] = atan2(s1*mat(K,I)-c1*mat(J,I), c1*mat(J,J) - s1 * mat(K,J));
}
template <typename Derived>
- static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res,
- const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
+ static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
{
using std::atan2;
- using std::sqrt;
+ using std::sin;
+ using std::cos;
typedef typename Derived::Scalar Scalar;
-
- const Scalar plusMinus = IsEven? 1 : -1;
- const Scalar minusPlus = IsOdd? 1 : -1;
-
- const Scalar Rsum = sqrt((mat(I, J) * mat(I, J) + mat(I, K) * mat(I, K) + mat(J, I) * mat(J, I) + mat(K, I) * mat(K, I)) / 2);
-
- res[1] = atan2(Rsum, mat(I, I));
-
- // There is a singularity when sin(beta) == 0
- if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// sin(beta) != 0
- res[0] = atan2(mat(J, I), minusPlus * mat(K, I));
- res[2] = atan2(mat(I, J), plusMinus * mat(I, K));
- }
- else if(mat(I, I) > 0) {// sin(beta) == 0 and cos(beta) == 1
- Scalar spos = plusMinus * mat(K, J) + minusPlus * mat(J, K); // 2*sin(alpha + gamma)
- Scalar cpos = mat(J, J) + mat(K, K); // 2*cos(alpha + gamma)
- res[0] = atan2(spos, cpos);
- res[2] = 0;
+ typedef Matrix<Scalar,2,1> Vector2;
+
+ res[0] = atan2(mat(J,I), mat(K,I));
+ if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
+ {
+ if(res[0] > Scalar(0)) {
+ res[0] -= Scalar(EIGEN_PI);
+ }
+ else {
+ res[0] += Scalar(EIGEN_PI);
+ }
+ Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+ res[1] = -atan2(s2, mat(I,I));
}
- else {// sin(beta) == 0 and cos(beta) == -1
- Scalar sneg = plusMinus * mat(K, J) + plusMinus * mat(J, K); // 2*sin(alpha - gamma)
- Scalar cneg = mat(J, J) - mat(K, K); // 2*cos(alpha - gamma)
- res[0] = atan2(sneg, cneg);
- res[2] = 0;
+ else
+ {
+ Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+ res[1] = atan2(s2, mat(I,I));
}
+
+ // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
+ // we can compute their respective rotation, and apply its inverse to M. Since the result must
+ // be a rotation around x, we have:
+ //
+ // c2 s1.s2 c1.s2 1 0 0
+ // 0 c1 -s1 * M = 0 c3 s3
+ // -s2 s1.c2 c1.c2 0 -s3 c3
+ //
+ // Thus: m11.c1 - m21.s1 = c3 & m12.c1 - m22.s1 = s3
+
+ Scalar s1 = sin(res[0]);
+ Scalar c1 = cos(res[0]);
+ res[2] = atan2(c1*mat(J,K)-s1*mat(K,K), c1*mat(J,J) - s1 * mat(K,J));
}
template<typename Scalar>
@@ -259,28 +252,55 @@ namespace Eigen
EulerAngles<Scalar, EulerSystem>& res,
const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
{
+ CalcEulerAngles(res, mat, false, false, false);
+ }
+
+ template<
+ bool PositiveRangeAlpha,
+ bool PositiveRangeBeta,
+ bool PositiveRangeGamma,
+ typename Scalar>
+ static void CalcEulerAngles(
+ EulerAngles<Scalar, EulerSystem>& res,
+ const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+ {
+ CalcEulerAngles(res, mat, PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma);
+ }
+
+ template<typename Scalar>
+ static void CalcEulerAngles(
+ EulerAngles<Scalar, EulerSystem>& res,
+ const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat,
+ bool PositiveRangeAlpha,
+ bool PositiveRangeBeta,
+ bool PositiveRangeGamma)
+ {
CalcEulerAngles_imp(
res.angles(), mat,
typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type());
- if (IsAlphaOpposite)
+ if (IsAlphaOpposite == IsOdd)
res.alpha() = -res.alpha();
- if (IsBetaOpposite)
+ if (IsBetaOpposite == IsOdd)
res.beta() = -res.beta();
- if (IsGammaOpposite)
+ if (IsGammaOpposite == IsOdd)
res.gamma() = -res.gamma();
+
+ // Saturate results to the requested range
+ if (PositiveRangeAlpha && (res.alpha() < 0))
+ res.alpha() += Scalar(2 * EIGEN_PI);
+
+ if (PositiveRangeBeta && (res.beta() < 0))
+ res.beta() += Scalar(2 * EIGEN_PI);
+
+ if (PositiveRangeGamma && (res.gamma() < 0))
+ res.gamma() += Scalar(2 * EIGEN_PI);
}
template <typename _Scalar, class _System>
friend class Eigen::EulerAngles;
-
- template<typename System,
- typename Other,
- int OtherRows,
- int OtherCols>
- friend struct internal::eulerangles_assign_impl;
};
#define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \
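
The ASCII sketch inside CalcEulerAngles_imp above (proper Euler branch) can be written more readably in LaTeX. Under the comment's own assumptions (a = (0,1,0), i = 0, j = 1, k = 2, c_n = cos(res[n-1]), s_n = sin(res[n-1]), zero-based matrix indices):

    \begin{pmatrix} c_2 & s_1 s_2 & c_1 s_2 \\ 0 & c_1 & -s_1 \\ -s_2 & s_1 c_2 & c_1 c_2 \end{pmatrix} M
    = \begin{pmatrix} 1 & 0 & 0 \\ 0 & c_3 & s_3 \\ 0 & -s_3 & c_3 \end{pmatrix}
    \qquad\Longrightarrow\qquad
    c_3 = c_1 M_{11} - s_1 M_{21}, \quad s_3 = c_1 M_{12} - s_1 M_{22}

which is exactly res[2] = atan2(c1*mat(J,K) - s1*mat(K,K), c1*mat(J,J) - s1*mat(K,J)) with (J,K) = (1,2).
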
diff --git a/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index db2449d..3f7d777 100644
--- a/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -398,8 +398,8 @@ struct matrix_function_compute
template <typename MatrixType>
struct matrix_function_compute<MatrixType, 0>
{
- template <typename AtomicType, typename ResultType>
- static void run(const MatrixType& A, AtomicType& atomic, ResultType &result)
+ template <typename MatA, typename AtomicType, typename ResultType>
+ static void run(const MatA& A, AtomicType& atomic, ResultType &result)
{
typedef internal::traits<MatrixType> Traits;
typedef typename Traits::Scalar Scalar;
@@ -422,11 +422,10 @@ struct matrix_function_compute<MatrixType, 0>
template <typename MatrixType>
struct matrix_function_compute<MatrixType, 1>
{
- template <typename AtomicType, typename ResultType>
- static void run(const MatrixType& A, AtomicType& atomic, ResultType &result)
+ template <typename MatA, typename AtomicType, typename ResultType>
+ static void run(const MatA& A, AtomicType& atomic, ResultType &result)
{
typedef internal::traits<MatrixType> Traits;
- typedef typename MatrixType::Index Index;
// compute Schur decomposition of A
const ComplexSchur<MatrixType> schurOfA(A);
@@ -514,7 +513,7 @@ template<typename Derived> class MatrixFunctionReturnValue
typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
AtomicType atomic(m_f);
- internal::matrix_function_compute<NestedEvalTypeClean>::run(m_A, atomic, result);
+ internal::matrix_function_compute<typename NestedEvalTypeClean::PlainObject>::run(m_A, atomic, result);
}
Index rows() const { return m_A.rows(); }
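
For context on the two run() call sites changed above: matrix_function_compute evaluates f on a concrete (PlainObject) matrix by first taking its complex Schur decomposition and then applying f to the triangular factor, relying on the standard identity (Q unitary, T upper triangular):

    A = Q T Q^{*} \quad\Longrightarrow\quad f(A) = Q \, f(T) \, Q^{*}

The AtomicType argument, as far as these hunks show, is the evaluator applied to the diagonal blocks of T.
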
diff --git a/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index 1acfbed..ff8f6e7 100644
--- a/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -339,7 +339,7 @@ public:
typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
AtomicType atomic;
- internal::matrix_function_compute<DerivedEvalTypeClean>::run(m_A, atomic, result);
+ internal::matrix_function_compute<typename DerivedEvalTypeClean::PlainObject>::run(m_A, atomic, result);
}
Index rows() const { return m_A.rows(); }
diff --git a/eigen/unsupported/Eigen/src/Polynomials/Companion.h b/eigen/unsupported/Eigen/src/Polynomials/Companion.h
index e0af6eb..b515c29 100644
--- a/eigen/unsupported/Eigen/src/Polynomials/Companion.h
+++ b/eigen/unsupported/Eigen/src/Polynomials/Companion.h
@@ -75,7 +75,7 @@ class companion
void setPolynomial( const VectorType& poly )
{
const Index deg = poly.size()-1;
- m_monic = Scalar(-1)/poly[deg] * poly.head(deg);
+ m_monic = -1/poly[deg] * poly.head(deg);
//m_bl_diag.setIdentity( deg-1 );
m_bl_diag.setOnes(deg-1);
}
@@ -107,8 +107,8 @@ class companion
* colB and rowB are respectively the multipliers for
* the column and the row in order to balance them.
* */
- bool balanced( RealScalar colNorm, RealScalar rowNorm,
- bool& isBalanced, RealScalar& colB, RealScalar& rowB );
+ bool balanced( Scalar colNorm, Scalar rowNorm,
+ bool& isBalanced, Scalar& colB, Scalar& rowB );
/** Helper function for the balancing algorithm.
* \returns true if the row and the column, having colNorm and rowNorm
@@ -116,8 +116,8 @@ class companion
* colB and rowB are respectively the multipliers for
* the column and the row in order to balance them.
* */
- bool balancedR( RealScalar colNorm, RealScalar rowNorm,
- bool& isBalanced, RealScalar& colB, RealScalar& rowB );
+ bool balancedR( Scalar colNorm, Scalar rowNorm,
+ bool& isBalanced, Scalar& colB, Scalar& rowB );
public:
/**
@@ -139,10 +139,10 @@ class companion
template< typename _Scalar, int _Deg >
inline
-bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm,
- bool& isBalanced, RealScalar& colB, RealScalar& rowB )
+bool companion<_Scalar,_Deg>::balanced( Scalar colNorm, Scalar rowNorm,
+ bool& isBalanced, Scalar& colB, Scalar& rowB )
{
- if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; }
+ if( Scalar(0) == colNorm || Scalar(0) == rowNorm ){ return true; }
else
{
//To find the balancing coefficients, if the radix is 2,
@@ -150,29 +150,29 @@ bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm,
// \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$
// then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$
// and the balancing coefficient for the column is \f$ 2^{\sigma} \f$
- rowB = rowNorm / radix<RealScalar>();
- colB = RealScalar(1);
- const RealScalar s = colNorm + rowNorm;
+ rowB = rowNorm / radix<Scalar>();
+ colB = Scalar(1);
+ const Scalar s = colNorm + rowNorm;
while (colNorm < rowB)
{
- colB *= radix<RealScalar>();
- colNorm *= radix2<RealScalar>();
+ colB *= radix<Scalar>();
+ colNorm *= radix2<Scalar>();
}
- rowB = rowNorm * radix<RealScalar>();
+ rowB = rowNorm * radix<Scalar>();
while (colNorm >= rowB)
{
- colB /= radix<RealScalar>();
- colNorm /= radix2<RealScalar>();
+ colB /= radix<Scalar>();
+ colNorm /= radix2<Scalar>();
}
//This line is used to avoid insubstantial balancing
- if ((rowNorm + colNorm) < RealScalar(0.95) * s * colB)
+ if ((rowNorm + colNorm) < Scalar(0.95) * s * colB)
{
isBalanced = false;
- rowB = RealScalar(1) / colB;
+ rowB = Scalar(1) / colB;
return false;
}
else{
@@ -182,21 +182,21 @@ bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm,
template< typename _Scalar, int _Deg >
inline
-bool companion<_Scalar,_Deg>::balancedR( RealScalar colNorm, RealScalar rowNorm,
- bool& isBalanced, RealScalar& colB, RealScalar& rowB )
+bool companion<_Scalar,_Deg>::balancedR( Scalar colNorm, Scalar rowNorm,
+ bool& isBalanced, Scalar& colB, Scalar& rowB )
{
- if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; }
+ if( Scalar(0) == colNorm || Scalar(0) == rowNorm ){ return true; }
else
{
/**
* Set the norm of the column and the row to the geometric mean
* of the row and column norm
*/
- const RealScalar q = colNorm/rowNorm;
+ const _Scalar q = colNorm/rowNorm;
if( !isApprox( q, _Scalar(1) ) )
{
rowB = sqrt( colNorm/rowNorm );
- colB = RealScalar(1)/rowB;
+ colB = Scalar(1)/rowB;
isBalanced = false;
return false;
@@ -219,8 +219,8 @@ void companion<_Scalar,_Deg>::balance()
while( !hasConverged )
{
hasConverged = true;
- RealScalar colNorm,rowNorm;
- RealScalar colB,rowB;
+ Scalar colNorm,rowNorm;
+ Scalar colB,rowB;
//First row, first column excluding the diagonal
//==============================================
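
A short worked instance of the balancing rule quoted in the Doxygen comment earlier in this file (numbers are illustrative): if rowNorm / colNorm = 40, then 2^(2*sigma - 1) < 40 <= 2^(2*sigma + 1) is satisfied by sigma = 3 (32 < 40 <= 128). The row is therefore scaled by 1/2^3 = 1/8 and the column by 2^3 = 8, after which the ratio becomes 40 / 2^6 = 0.625, i.e. it lands in (1/2, 2]. Using exact powers of the radix means the scaling introduces no floating-point rounding error.
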
diff --git a/eigen/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/eigen/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
index 7885942..03198ec 100644
--- a/eigen/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
+++ b/eigen/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
@@ -99,7 +99,7 @@ class PolynomialSolverBase
*/
inline const RootType& greatestRoot() const
{
- std::greater<RealScalar> greater;
+ std::greater<Scalar> greater;
return selectComplexRoot_withRespectToNorm( greater );
}
@@ -108,7 +108,7 @@ class PolynomialSolverBase
*/
inline const RootType& smallestRoot() const
{
- std::less<RealScalar> less;
+ std::less<Scalar> less;
return selectComplexRoot_withRespectToNorm( less );
}
@@ -213,7 +213,7 @@ class PolynomialSolverBase
bool& hasArealRoot,
const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
{
- std::greater<RealScalar> greater;
+ std::greater<Scalar> greater;
return selectRealRoot_withRespectToAbsRealPart( greater, hasArealRoot, absImaginaryThreshold );
}
@@ -236,7 +236,7 @@ class PolynomialSolverBase
bool& hasArealRoot,
const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
{
- std::less<RealScalar> less;
+ std::less<Scalar> less;
return selectRealRoot_withRespectToAbsRealPart( less, hasArealRoot, absImaginaryThreshold );
}
@@ -259,7 +259,7 @@ class PolynomialSolverBase
bool& hasArealRoot,
const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
{
- std::greater<RealScalar> greater;
+ std::greater<Scalar> greater;
return selectRealRoot_withRespectToRealPart( greater, hasArealRoot, absImaginaryThreshold );
}
@@ -282,7 +282,7 @@ class PolynomialSolverBase
bool& hasArealRoot,
const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
{
- std::less<RealScalar> less;
+ std::less<Scalar> less;
return selectRealRoot_withRespectToRealPart( less, hasArealRoot, absImaginaryThreshold );
}
@@ -327,7 +327,7 @@ class PolynomialSolverBase
* However, almost always, correct accuracy is reached even in these cases for 64bit
* (double) floating types and small polynomial degree (<20).
*/
-template<typename _Scalar, int _Deg>
+template< typename _Scalar, int _Deg >
class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg>
{
public:
@@ -337,9 +337,7 @@ class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg>
EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base )
typedef Matrix<Scalar,_Deg,_Deg> CompanionMatrixType;
- typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
- ComplexEigenSolver<CompanionMatrixType>,
- EigenSolver<CompanionMatrixType> >::type EigenSolverType;
+ typedef EigenSolver<CompanionMatrixType> EigenSolverType;
public:
/** Computes the complex roots of a new polynomial. */
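
For reference while reading the solver changes above, a hedged usage sketch (coefficients are ordered from the constant term upward, per the Polynomials module convention; greatestRealRoot is assumed to be the public accessor wrapping the selectRealRoot_* helpers shown in the earlier hunks):

    #include <iostream>
    #include <Eigen/Core>
    #include <unsupported/Eigen/Polynomials>
    using namespace Eigen;

    int main() {
      // p(x) = 2 - 3x + x^2, whose roots are 1 and 2.
      Vector3d coeffs(2.0, -3.0, 1.0);
      PolynomialSolver<double, 2> solver(coeffs);

      std::cout << "complex roots: " << solver.roots().transpose() << "\n";

      bool hasRealRoot = false;
      double largest = solver.greatestRealRoot(hasRealRoot);
      if (hasRealRoot) std::cout << "greatest real root: " << largest << "\n";
      return 0;
    }
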
diff --git a/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h b/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h
index fc70a24..cdc14f8 100644
--- a/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -12,38 +12,38 @@
#define EIGEN_SPARSE_MARKET_IO_H
#include <iostream>
-#include <vector>
namespace Eigen {
namespace internal
{
- template <typename Scalar, typename StorageIndex>
- inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, Scalar& value)
+ template <typename Scalar>
+ inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, Scalar& value)
{
- std::stringstream sline(line);
- sline >> i >> j >> value;
+ line >> i >> j >> value;
+ i--;
+ j--;
+ if(i>=0 && j>=0 && i<M && j<N)
+ {
+ return true;
+ }
+ else
+ return false;
}
-
- template<> inline void GetMarketLine (const char* line, int& i, int& j, float& value)
- { std::sscanf(line, "%d %d %g", &i, &j, &value); }
-
- template<> inline void GetMarketLine (const char* line, int& i, int& j, double& value)
- { std::sscanf(line, "%d %d %lg", &i, &j, &value); }
-
- template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<float>& value)
- { std::sscanf(line, "%d %d %g %g", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
-
- template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<double>& value)
- { std::sscanf(line, "%d %d %lg %lg", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
-
- template <typename Scalar, typename StorageIndex>
- inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, std::complex<Scalar>& value)
+ template <typename Scalar>
+ inline bool GetMarketLine (std::stringstream& line, Index& M, Index& N, Index& i, Index& j, std::complex<Scalar>& value)
{
- std::stringstream sline(line);
Scalar valR, valI;
- sline >> i >> j >> valR >> valI;
- value = std::complex<Scalar>(valR,valI);
+ line >> i >> j >> valR >> valI;
+ i--;
+ j--;
+ if(i>=0 && j>=0 && i<M && j<N)
+ {
+ value = std::complex<Scalar>(valR, valI);
+ return true;
+ }
+ else
+ return false;
}
template <typename RealScalar>
@@ -81,13 +81,13 @@ namespace internal
}
}
- template<typename Scalar, typename StorageIndex>
- inline void PutMatrixElt(Scalar value, StorageIndex row, StorageIndex col, std::ofstream& out)
+ template<typename Scalar>
+ inline void PutMatrixElt(Scalar value, int row, int col, std::ofstream& out)
{
out << row << " "<< col << " " << value << "\n";
}
- template<typename Scalar, typename StorageIndex>
- inline void PutMatrixElt(std::complex<Scalar> value, StorageIndex row, StorageIndex col, std::ofstream& out)
+ template<typename Scalar>
+ inline void PutMatrixElt(std::complex<Scalar> value, int row, int col, std::ofstream& out)
{
out << row << " " << col << " " << value.real() << " " << value.imag() << "\n";
}
@@ -133,20 +133,17 @@ template<typename SparseMatrixType>
bool loadMarket(SparseMatrixType& mat, const std::string& filename)
{
typedef typename SparseMatrixType::Scalar Scalar;
- typedef typename SparseMatrixType::StorageIndex StorageIndex;
+ typedef typename SparseMatrixType::Index Index;
std::ifstream input(filename.c_str(),std::ios::in);
if(!input)
return false;
-
- char rdbuffer[4096];
- input.rdbuf()->pubsetbuf(rdbuffer, 4096);
const int maxBuffersize = 2048;
char buffer[maxBuffersize];
bool readsizes = false;
- typedef Triplet<Scalar,StorageIndex> T;
+ typedef Triplet<Scalar,Index> T;
std::vector<T> elements;
Index M(-1), N(-1), NNZ(-1);
@@ -157,36 +154,33 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename)
//NOTE An appropriate test should be done on the header to get the symmetry
if(buffer[0]=='%')
continue;
-
+
+ std::stringstream line(buffer);
+
if(!readsizes)
{
- std::stringstream line(buffer);
line >> M >> N >> NNZ;
if(M > 0 && N > 0 && NNZ > 0)
{
readsizes = true;
+ //std::cout << "sizes: " << M << "," << N << "," << NNZ << "\n";
mat.resize(M,N);
mat.reserve(NNZ);
}
}
else
{
- StorageIndex i(-1), j(-1);
+ Index i(-1), j(-1);
Scalar value;
- internal::GetMarketLine(buffer, i, j, value);
-
- i--;
- j--;
- if(i>=0 && j>=0 && i<M && j<N)
+ if( internal::GetMarketLine(line, M, N, i, j, value) )
{
- ++count;
+ ++ count;
elements.push_back(T(i,j,value));
}
- else
+ else
std::cerr << "Invalid read: " << i << "," << j << "\n";
}
}
-
mat.setFromTriplets(elements.begin(), elements.end());
if(count!=NNZ)
std::cerr << count << "!=" << NNZ << "\n";
@@ -231,13 +225,12 @@ template<typename SparseMatrixType>
bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sym = 0)
{
typedef typename SparseMatrixType::Scalar Scalar;
- typedef typename SparseMatrixType::RealScalar RealScalar;
std::ofstream out(filename.c_str(),std::ios::out);
if(!out)
return false;
out.flags(std::ios_base::scientific);
- out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
+ out.precision(64);
std::string header;
internal::putMarketHeader<Scalar>(header, sym);
out << header << std::endl;
@@ -248,6 +241,7 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy
{
++ count;
internal::PutMatrixElt(it.value(), it.row()+1, it.col()+1, out);
+ // out << it.row()+1 << " " << it.col()+1 << " " << it.value() << "\n";
}
out.close();
return true;
@@ -256,14 +250,13 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy
template<typename VectorType>
bool saveMarketVector (const VectorType& vec, const std::string& filename)
{
- typedef typename VectorType::Scalar Scalar;
- typedef typename VectorType::RealScalar RealScalar;
+ typedef typename VectorType::Scalar Scalar;
std::ofstream out(filename.c_str(),std::ios::out);
if(!out)
return false;
out.flags(std::ios_base::scientific);
- out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
+ out.precision(64);
if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
out << "%%MatrixMarket matrix array complex general\n";
else
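
Stepping back from the hunks above: loadMarket() consumes the Matrix Market coordinate format, i.e. a header, a "rows cols nnz" size line, and then one 1-based "row col value" triplet per line. A standalone hedged sketch of the per-line parsing step (parseTripletLine is an illustrative name, not Eigen's):

    #include <sstream>
    #include <string>

    // Returns true and fills (i, j, value) with 0-based indices when the line
    // holds a triplet that fits inside an M x N matrix.
    bool parseTripletLine(const std::string& line, long M, long N,
                          long& i, long& j, double& value) {
      std::istringstream s(line);
      if (!(s >> i >> j >> value)) return false;
      --i; --j;                       // Matrix Market indices are 1-based
      return i >= 0 && j >= 0 && i < M && j < N;
    }
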
diff --git a/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 369ad97..f524d71 100644
--- a/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -122,8 +122,8 @@ struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
- int dummy;
- return ::lgammaf_r(x, &dummy);
+ int signgam;
+ return ::lgammaf_r(x, &signgam);
#else
return ::lgammaf(x);
#endif
@@ -135,8 +135,8 @@ struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
- int dummy;
- return ::lgamma_r(x, &dummy);
+ int signgam;
+ return ::lgamma_r(x, &signgam);
#else
return ::lgamma(x);
#endif
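
For reference on the call both hunks above touch: lgamma_r / lgammaf_r are the re-entrant BSD/SVID variants of lgamma; they return log|gamma(x)| and report the sign of gamma(x) through the int* argument instead of the global signgam. A minimal hedged sketch (assumes a platform where the re-entrant variant is declared, matching the preprocessor guard above; log_abs_gamma is an illustrative name):

    #include <cmath>

    double log_abs_gamma(double x, int& sign_out) {
      return ::lgamma_r(x, &sign_out);  // sign_out receives +1 or -1
    }
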
diff --git a/eigen/unsupported/doc/examples/EulerAngles.cpp b/eigen/unsupported/doc/examples/EulerAngles.cpp
index 3f8ca8c..1ef6aee 100644
--- a/eigen/unsupported/doc/examples/EulerAngles.cpp
+++ b/eigen/unsupported/doc/examples/EulerAngles.cpp
@@ -23,7 +23,7 @@ int main()
// Some Euler angles representation that our plane use.
EulerAnglesZYZd planeAngles(0.78474, 0.5271, -0.513794);
- MyArmyAngles planeAnglesInMyArmyAngles(planeAngles);
+ MyArmyAngles planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeAngles);
std::cout << "vehicle angles(MyArmy): " << vehicleAngles << std::endl;
std::cout << "plane angles(ZYZ): " << planeAngles << std::endl;
@@ -37,7 +37,7 @@ int main()
Quaterniond planeRotated = AngleAxisd(-0.342, Vector3d::UnitY()) * planeAngles;
planeAngles = planeRotated;
- planeAnglesInMyArmyAngles = planeRotated;
+ planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeRotated);
std::cout << "new plane angles(ZYZ): " << planeAngles << std::endl;
std::cout << "new plane angles(MyArmy): " << planeAnglesInMyArmyAngles << std::endl;
diff --git a/eigen/unsupported/test/CMakeLists.txt b/eigen/unsupported/test/CMakeLists.txt
index 003c9de..b5fa1c8 100644
--- a/eigen/unsupported/test/CMakeLists.txt
+++ b/eigen/unsupported/test/CMakeLists.txt
@@ -21,17 +21,6 @@ include_directories(../../test ../../unsupported ../../Eigen
find_package (Threads)
-find_package(Xsmm)
-if(XSMM_FOUND)
- add_definitions("-DEIGEN_USE_LIBXSMM")
- include_directories(${XSMM_INCLUDES})
- link_directories(${XSMM_LIBRARIES})
- set(EXTERNAL_LIBS ${EXTERNAL_LIBS} xsmm)
- ei_add_property(EIGEN_TESTED_BACKENDS "Xsmm, ")
-else(XSMM_FOUND)
- ei_add_property(EIGEN_MISSING_BACKENDS "Xsmm, ")
-endif(XSMM_FOUND)
-
find_package(GoogleHash)
if(GOOGLEHASH_FOUND)
add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
@@ -157,16 +146,6 @@ if(EIGEN_TEST_CXX11)
ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_morphing_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_shuffling_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_padding_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11")
- ei_add_test_sycl(cxx11_tensor_chipping_sycl "-std=c++11")
endif(EIGEN_TEST_SYCL)
# It should be safe to always run these tests as there is some fallback code for
# older compilers that don't support cxx11.
diff --git a/eigen/unsupported/test/EulerAngles.cpp b/eigen/unsupported/test/EulerAngles.cpp
index 79ee728..a8cb528 100644
--- a/eigen/unsupported/test/EulerAngles.cpp
+++ b/eigen/unsupported/test/EulerAngles.cpp
@@ -13,219 +13,146 @@
using namespace Eigen;
-// Unfortunately, we need to specialize it in order to work. (We could add it in main.h test framework)
-template <typename Scalar, class System>
-bool verifyIsApprox(const Eigen::EulerAngles<Scalar, System>& a, const Eigen::EulerAngles<Scalar, System>& b)
-{
- return verifyIsApprox(a.angles(), b.angles());
-}
-
-// Verify that x is in the approxed range [a, b]
-#define VERIFY_APPROXED_RANGE(a, x, b) \
- do { \
- VERIFY_IS_APPROX_OR_LESS_THAN(a, x); \
- VERIFY_IS_APPROX_OR_LESS_THAN(x, b); \
- } while(0)
-
-const char X = EULER_X;
-const char Y = EULER_Y;
-const char Z = EULER_Z;
-
-template<typename Scalar, class EulerSystem>
-void verify_euler(const EulerAngles<Scalar, EulerSystem>& e)
+template<typename EulerSystem, typename Scalar>
+void verify_euler_ranged(const Matrix<Scalar,3,1>& ea,
+ bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma)
{
typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
typedef Matrix<Scalar,3,3> Matrix3;
typedef Matrix<Scalar,3,1> Vector3;
typedef Quaternion<Scalar> QuaternionType;
typedef AngleAxis<Scalar> AngleAxisType;
+ using std::abs;
- const Scalar ONE = Scalar(1);
- const Scalar HALF_PI = Scalar(EIGEN_PI / 2);
- const Scalar PI = Scalar(EIGEN_PI);
+ Scalar alphaRangeStart, alphaRangeEnd;
+ Scalar betaRangeStart, betaRangeEnd;
+ Scalar gammaRangeStart, gammaRangeEnd;
- // It's very important calc the acceptable precision depending on the distance from the pole.
- const Scalar longitudeRadius = std::abs(
- EulerSystem::IsTaitBryan ?
- std::cos(e.beta()) :
- std::sin(e.beta())
- );
- Scalar precision = test_precision<Scalar>() / longitudeRadius;
+ if (positiveRangeAlpha)
+ {
+ alphaRangeStart = Scalar(0);
+ alphaRangeEnd = Scalar(2 * EIGEN_PI);
+ }
+ else
+ {
+ alphaRangeStart = -Scalar(EIGEN_PI);
+ alphaRangeEnd = Scalar(EIGEN_PI);
+ }
- Scalar betaRangeStart, betaRangeEnd;
- if (EulerSystem::IsTaitBryan)
+ if (positiveRangeBeta)
+ {
+ betaRangeStart = Scalar(0);
+ betaRangeEnd = Scalar(2 * EIGEN_PI);
+ }
+ else
+ {
+ betaRangeStart = -Scalar(EIGEN_PI);
+ betaRangeEnd = Scalar(EIGEN_PI);
+ }
+
+ if (positiveRangeGamma)
{
- betaRangeStart = -HALF_PI;
- betaRangeEnd = HALF_PI;
+ gammaRangeStart = Scalar(0);
+ gammaRangeEnd = Scalar(2 * EIGEN_PI);
}
else
{
- if (!EulerSystem::IsBetaOpposite)
- {
- betaRangeStart = 0;
- betaRangeEnd = PI;
- }
- else
- {
- betaRangeStart = -PI;
- betaRangeEnd = 0;
- }
+ gammaRangeStart = -Scalar(EIGEN_PI);
+ gammaRangeEnd = Scalar(EIGEN_PI);
}
+ const int i = EulerSystem::AlphaAxisAbs - 1;
+ const int j = EulerSystem::BetaAxisAbs - 1;
+ const int k = EulerSystem::GammaAxisAbs - 1;
+
+ const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1;
+ const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1;
+ const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1;
+
const Vector3 I = EulerAnglesType::AlphaAxisVector();
const Vector3 J = EulerAnglesType::BetaAxisVector();
const Vector3 K = EulerAnglesType::GammaAxisVector();
- // Is approx checks
- VERIFY(e.isApprox(e));
- VERIFY_IS_APPROX(e, e);
- VERIFY_IS_NOT_APPROX(e, EulerAnglesType(e.alpha() + ONE, e.beta() + ONE, e.gamma() + ONE));
-
- const Matrix3 m(e);
- VERIFY_IS_APPROX(Scalar(m.determinant()), ONE);
-
- EulerAnglesType ebis(m);
+ EulerAnglesType e(ea[0], ea[1], ea[2]);
- // When no roll(acting like polar representation), we have the best precision.
- // One of those cases is when the Euler angles are on the pole, and because it's singular case,
- // the computation returns no roll.
- if (ebis.beta() == 0)
- precision = test_precision<Scalar>();
+ Matrix3 m(e);
+ Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
// Check that eabis in range
- VERIFY_APPROXED_RANGE(-PI, ebis.alpha(), PI);
- VERIFY_APPROXED_RANGE(betaRangeStart, ebis.beta(), betaRangeEnd);
- VERIFY_APPROXED_RANGE(-PI, ebis.gamma(), PI);
-
- const Matrix3 mbis(AngleAxisType(ebis.alpha(), I) * AngleAxisType(ebis.beta(), J) * AngleAxisType(ebis.gamma(), K));
- VERIFY_IS_APPROX(Scalar(mbis.determinant()), ONE);
- VERIFY_IS_APPROX(mbis, ebis.toRotationMatrix());
- /*std::cout << "===================\n" <<
- "e: " << e << std::endl <<
- "eabis: " << eabis.transpose() << std::endl <<
- "m: " << m << std::endl <<
- "mbis: " << mbis << std::endl <<
- "X: " << (m * Vector3::UnitX()).transpose() << std::endl <<
- "X: " << (mbis * Vector3::UnitX()).transpose() << std::endl;*/
- VERIFY(m.isApprox(mbis, precision));
-
- // Test if ea and eabis are the same
- // Need to check both singular and non-singular cases
- // There are two singular cases.
- // 1. When I==K and sin(ea(1)) == 0
- // 2. When I!=K and cos(ea(1)) == 0
-
- // TODO: Make this test work well, and use range saturation function.
- /*// If I==K, and ea[1]==0, then there no unique solution.
- // The remark apply in the case where I!=K, and |ea[1]| is close to +-pi/2.
- if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) )
- VERIFY_IS_APPROX(ea, eabis);*/
+ VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd);
+ VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd);
+ VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd);
- // Quaternions
- const QuaternionType q(e);
- ebis = q;
- const QuaternionType qbis(ebis);
- VERIFY(internal::isApprox<Scalar>(std::abs(q.dot(qbis)), ONE, precision));
- //VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
+ Vector3 eabis2 = m.eulerAngles(i, j, k);
- // A suggestion for simple product test when will be supported.
- /*EulerAnglesType e2(PI/2, PI/2, PI/2);
- Matrix3 m2(e2);
- VERIFY_IS_APPROX(e*e2, m*m2);*/
-}
-
-template<signed char A, signed char B, signed char C, typename Scalar>
-void verify_euler_vec(const Matrix<Scalar,3,1>& ea)
-{
- verify_euler(EulerAngles<Scalar, EulerSystem<A, B, C> >(ea[0], ea[1], ea[2]));
-}
-
-template<signed char A, signed char B, signed char C, typename Scalar>
-void verify_euler_all_neg(const Matrix<Scalar,3,1>& ea)
-{
- verify_euler_vec<+A,+B,+C>(ea);
- verify_euler_vec<+A,+B,-C>(ea);
- verify_euler_vec<+A,-B,+C>(ea);
- verify_euler_vec<+A,-B,-C>(ea);
+ // Invert the relevant axes
+ eabis2[0] *= iFactor;
+ eabis2[1] *= jFactor;
+ eabis2[2] *= kFactor;
- verify_euler_vec<-A,+B,+C>(ea);
- verify_euler_vec<-A,+B,-C>(ea);
- verify_euler_vec<-A,-B,+C>(ea);
- verify_euler_vec<-A,-B,-C>(ea);
-}
-
-template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
-{
- verify_euler_all_neg<X,Y,Z>(ea);
- verify_euler_all_neg<X,Y,X>(ea);
- verify_euler_all_neg<X,Z,Y>(ea);
- verify_euler_all_neg<X,Z,X>(ea);
+ // Saturate the angles to the correct range
+ if (positiveRangeAlpha && (eabis2[0] < 0))
+ eabis2[0] += Scalar(2 * EIGEN_PI);
+ if (positiveRangeBeta && (eabis2[1] < 0))
+ eabis2[1] += Scalar(2 * EIGEN_PI);
+ if (positiveRangeGamma && (eabis2[2] < 0))
+ eabis2[2] += Scalar(2 * EIGEN_PI);
- verify_euler_all_neg<Y,Z,X>(ea);
- verify_euler_all_neg<Y,Z,Y>(ea);
- verify_euler_all_neg<Y,X,Z>(ea);
- verify_euler_all_neg<Y,X,Y>(ea);
+ VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimate matches what m.eulerAngles() returns
- verify_euler_all_neg<Z,X,Y>(ea);
- verify_euler_all_neg<Z,X,Z>(ea);
- verify_euler_all_neg<Z,Y,X>(ea);
- verify_euler_all_neg<Z,Y,Z>(ea);
-}
-
-template<typename Scalar> void check_singular_cases(const Scalar& singularBeta)
-{
- typedef Matrix<Scalar,3,1> Vector3;
- const Scalar PI = Scalar(EIGEN_PI);
+ Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K));
+ VERIFY_IS_APPROX(m, mbis);
- for (Scalar epsilon = NumTraits<Scalar>::epsilon(); epsilon < 1; epsilon *= Scalar(1.2))
+ // Tests that are only relevant when no positive range is requested
+ if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma))
{
- check_all_var(Vector3(PI/4, singularBeta, PI/3));
- check_all_var(Vector3(PI/4, singularBeta - epsilon, PI/3));
- check_all_var(Vector3(PI/4, singularBeta - Scalar(1.5)*epsilon, PI/3));
- check_all_var(Vector3(PI/4, singularBeta - 2*epsilon, PI/3));
- check_all_var(Vector3(PI*Scalar(0.8), singularBeta - epsilon, Scalar(0.9)*PI));
- check_all_var(Vector3(PI*Scalar(-0.9), singularBeta + epsilon, PI*Scalar(0.3)));
- check_all_var(Vector3(PI*Scalar(-0.6), singularBeta + Scalar(1.5)*epsilon, PI*Scalar(0.3)));
- check_all_var(Vector3(PI*Scalar(-0.5), singularBeta + 2*epsilon, PI*Scalar(0.4)));
- check_all_var(Vector3(PI*Scalar(0.9), singularBeta + epsilon, Scalar(0.8)*PI));
+ /* If I==K and ea[1]==0, then there is no unique solution. */
+ /* The same remark applies when I!=K and |ea[1]| is close to pi/2. */
+ if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) )
+ VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
+
+ // approx_or_less_than does not work for 0
+ VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
}
- // This one for sanity, it had a problem with near pole cases in float scalar.
- check_all_var(Vector3(PI*Scalar(0.8), singularBeta - Scalar(1E-6), Scalar(0.9)*PI));
+ // Quaternions
+ QuaternionType q(e);
+ eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+ VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
}
-template<typename Scalar> void eulerangles_manual()
+template<typename EulerSystem, typename Scalar>
+void verify_euler(const Matrix<Scalar,3,1>& ea)
{
- typedef Matrix<Scalar,3,1> Vector3;
- const Vector3 Zero = Vector3::Zero();
- const Scalar PI = Scalar(EIGEN_PI);
-
- check_all_var(Zero);
-
- // singular cases
- check_singular_cases(PI/2);
- check_singular_cases(-PI/2);
-
- check_singular_cases(Scalar(0));
- check_singular_cases(Scalar(-0));
-
- check_singular_cases(PI);
- check_singular_cases(-PI);
-
- // non-singular cases
- VectorXd alpha = VectorXd::LinSpaced(Eigen::Sequential, 20, Scalar(-0.99) * PI, PI);
- VectorXd beta = VectorXd::LinSpaced(Eigen::Sequential, 20, Scalar(-0.49) * PI, Scalar(0.49) * PI);
- VectorXd gamma = VectorXd::LinSpaced(Eigen::Sequential, 20, Scalar(-0.99) * PI, PI);
- for (int i = 0; i < alpha.size(); ++i) {
- for (int j = 0; j < beta.size(); ++j) {
- for (int k = 0; k < gamma.size(); ++k) {
- check_all_var(Vector3d(alpha(i), beta(j), gamma(k)));
- }
- }
- }
+ verify_euler_ranged<EulerSystem>(ea, false, false, false);
+ verify_euler_ranged<EulerSystem>(ea, false, false, true);
+ verify_euler_ranged<EulerSystem>(ea, false, true, false);
+ verify_euler_ranged<EulerSystem>(ea, false, true, true);
+ verify_euler_ranged<EulerSystem>(ea, true, false, false);
+ verify_euler_ranged<EulerSystem>(ea, true, false, true);
+ verify_euler_ranged<EulerSystem>(ea, true, true, false);
+ verify_euler_ranged<EulerSystem>(ea, true, true, true);
}
-template<typename Scalar> void eulerangles_rand()
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+ verify_euler<EulerSystemXYZ>(ea);
+ verify_euler<EulerSystemXYX>(ea);
+ verify_euler<EulerSystemXZY>(ea);
+ verify_euler<EulerSystemXZX>(ea);
+
+ verify_euler<EulerSystemYZX>(ea);
+ verify_euler<EulerSystemYZY>(ea);
+ verify_euler<EulerSystemYXZ>(ea);
+ verify_euler<EulerSystemYXY>(ea);
+
+ verify_euler<EulerSystemZXY>(ea);
+ verify_euler<EulerSystemZXZ>(ea);
+ verify_euler<EulerSystemZYX>(ea);
+ verify_euler<EulerSystemZYZ>(ea);
+}
+
+template<typename Scalar> void eulerangles()
{
typedef Matrix<Scalar,3,3> Matrix3;
typedef Matrix<Scalar,3,1> Vector3;
@@ -274,19 +201,8 @@ template<typename Scalar> void eulerangles_rand()
void test_EulerAngles()
{
- // Simple cast test
- EulerAnglesXYZd onesEd(1, 1, 1);
- EulerAnglesXYZf onesEf = onesEd.cast<float>();
- VERIFY_IS_APPROX(onesEd, onesEf.cast<double>());
-
- CALL_SUBTEST_1( eulerangles_manual<float>() );
- CALL_SUBTEST_2( eulerangles_manual<double>() );
-
for(int i = 0; i < g_repeat; i++) {
- CALL_SUBTEST_3( eulerangles_rand<float>() );
- CALL_SUBTEST_4( eulerangles_rand<double>() );
+ CALL_SUBTEST_1( eulerangles<float>() );
+ CALL_SUBTEST_2( eulerangles<double>() );
}
-
- // TODO: Add tests for auto diff
- // TODO: Add tests for complex numbers
}
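
Note: the saturation step in verify_euler_ranged above maps an angle returned in (-pi, pi] into [0, 2*pi) when a positive range is requested. A minimal sketch of that mapping (the helper name to_positive_range is illustrative, not part of the test):

    #include <cmath>

    // Wrap an angle from (-pi, pi] into [0, 2*pi), mirroring the
    // "eabis2[i] += Scalar(2 * EIGEN_PI)" fix-ups in the test above.
    static double to_positive_range(double angle) {
      return angle < 0.0 ? angle + 2.0 * M_PI : angle;
    }
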
diff --git a/eigen/unsupported/test/autodiff_scalar.cpp b/eigen/unsupported/test/autodiff_scalar.cpp
index 4df2f5c..9cf1128 100644
--- a/eigen/unsupported/test/autodiff_scalar.cpp
+++ b/eigen/unsupported/test/autodiff_scalar.cpp
@@ -72,6 +72,20 @@ template<typename Scalar> void check_hyperbolic_functions()
VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150));
}
+template <typename Scalar>
+void check_limits_specialization()
+{
+ typedef Eigen::Matrix<Scalar, 1, 1> Deriv;
+ typedef Eigen::AutoDiffScalar<Deriv> AD;
+
+ typedef std::numeric_limits<AD> A;
+ typedef std::numeric_limits<Scalar> B;
+
+#if EIGEN_HAS_CXX11
+ VERIFY(bool(std::is_base_of<B, A>::value));
+#endif
+}
+
void test_autodiff_scalar()
{
for(int i = 0; i < g_repeat; i++) {
@@ -79,5 +93,6 @@ void test_autodiff_scalar()
CALL_SUBTEST_2( check_atan2<double>() );
CALL_SUBTEST_3( check_hyperbolic_functions<float>() );
CALL_SUBTEST_4( check_hyperbolic_functions<double>() );
+ CALL_SUBTEST_5( check_limits_specialization<double>());
}
}
diff --git a/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp
index 48cd2d4..5f9bb93 100644
--- a/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp
+++ b/eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -11,7 +11,6 @@
#define EIGEN_USE_THREADS
#include "main.h"
#include "Eigen/CXX11/ThreadPool"
-#include "Eigen/CXX11/Tensor"
static void test_create_destroy_empty_pool()
{
@@ -23,11 +22,11 @@ static void test_create_destroy_empty_pool()
}
-static void test_parallelism(bool allow_spinning)
+static void test_parallelism()
{
// Test we never-ever fail to match available tasks with idle threads.
const int kThreads = 16; // code below expects that this is a multiple of 4
- NonBlockingThreadPool tp(kThreads, allow_spinning);
+ NonBlockingThreadPool tp(kThreads);
VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);
for (int iter = 0; iter < 100; ++iter) {
@@ -101,25 +100,8 @@ static void test_parallelism(bool allow_spinning)
}
}
-
-static void test_cancel()
-{
- NonBlockingThreadPool tp(2);
-
- // Schedule a large number of closure that each sleeps for one second. This
- // will keep the thread pool busy for much longer than the default test timeout.
- for (int i = 0; i < 1000; ++i) {
- tp.Schedule([]() { EIGEN_SLEEP(2000); });
- }
-
- // Cancel the processing of all the closures that are still pending.
- tp.Cancel();
-}
-
void test_cxx11_non_blocking_thread_pool()
{
CALL_SUBTEST(test_create_destroy_empty_pool());
- CALL_SUBTEST(test_parallelism(true));
- CALL_SUBTEST(test_parallelism(false));
- CALL_SUBTEST(test_cancel());
+ CALL_SUBTEST(test_parallelism());
}
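
Note: for reference, a minimal usage sketch of the pool as exercised by the remaining test, assuming only the constructor and Schedule() call that appear above and the test's include paths; the pool size, counter, and busy-wait are illustrative:

    #define EIGEN_USE_THREADS
    #include "Eigen/CXX11/ThreadPool"
    #include <atomic>

    int main() {
      Eigen::NonBlockingThreadPool tp(4);   // pool with 4 worker threads
      std::atomic<int> counter(0);
      for (int i = 0; i < 100; ++i)
        tp.Schedule([&counter]() { counter.fetch_add(1); });
      // Wait until every scheduled closure has run before the pool is destroyed.
      while (counter.load() < 100) {}
      return 0;
    }
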
diff --git a/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
index 21fdfca..7201bfe 100644
--- a/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
+++ b/eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_broadcast_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_SYCL
#include "main.h"
@@ -25,99 +25,39 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_broadcast_sycl_fixed(const Eigen::SyclDevice &sycl_device){
-
- // BROADCAST test:
- IndexType inDim1=2;
- IndexType inDim2=3;
- IndexType inDim3=5;
- IndexType inDim4=7;
- IndexType bDim1=2;
- IndexType bDim2=3;
- IndexType bDim3=1;
- IndexType bDim4=4;
- array<IndexType, 4> in_range = {{inDim1, inDim2, inDim3, inDim4}};
- array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}};
- array<IndexType, 4> out_range; // = in_range * broadcasts
- for (size_t i = 0; i < out_range.size(); ++i)
- out_range[i] = in_range[i] * broadcasts[i];
-
- Tensor<DataType, 4, DataLayout, IndexType> input(in_range);
- Tensor<DataType, 4, DataLayout, IndexType> out(out_range);
-
- for (size_t i = 0; i < in_range.size(); ++i)
- VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
-
-
- for (IndexType i = 0; i < input.size(); ++i)
- input(i) = static_cast<DataType>(i);
-
- DataType * gpu_in_data = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType)));
- DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
-
- TensorMap<TensorFixedSize<DataType, Sizes<2, 3, 5, 7>, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range);
- TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range);
- sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType));
- gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
-
- for (IndexType i = 0; i < inDim1*bDim1; ++i) {
- for (IndexType j = 0; j < inDim2*bDim2; ++j) {
- for (IndexType k = 0; k < inDim3*bDim3; ++k) {
- for (IndexType l = 0; l < inDim4*bDim4; ++l) {
- VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l));
- }
- }
- }
- }
- printf("Broadcast Test with fixed size Passed\n");
- sycl_device.deallocate(gpu_in_data);
- sycl_device.deallocate(gpu_out_data);
-}
-
-template <typename DataType, int DataLayout, typename IndexType>
static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){
// BROADCAST test:
- IndexType inDim1=2;
- IndexType inDim2=3;
- IndexType inDim3=5;
- IndexType inDim4=7;
- IndexType bDim1=2;
- IndexType bDim2=3;
- IndexType bDim3=1;
- IndexType bDim4=4;
- array<IndexType, 4> in_range = {{inDim1, inDim2, inDim3, inDim4}};
- array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}};
- array<IndexType, 4> out_range; // = in_range * broadcasts
+ array<int, 4> in_range = {{2, 3, 5, 7}};
+ array<int, 4> broadcasts = {{2, 3, 1, 4}};
+ array<int, 4> out_range; // = in_range * broadcasts
for (size_t i = 0; i < out_range.size(); ++i)
out_range[i] = in_range[i] * broadcasts[i];
- Tensor<DataType, 4, DataLayout, IndexType> input(in_range);
- Tensor<DataType, 4, DataLayout, IndexType> out(out_range);
+ Tensor<float, 4> input(in_range);
+ Tensor<float, 4> out(out_range);
for (size_t i = 0; i < in_range.size(); ++i)
VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
- for (IndexType i = 0; i < input.size(); ++i)
- input(i) = static_cast<DataType>(i);
+ for (int i = 0; i < input.size(); ++i)
+ input(i) = static_cast<float>(i);
- DataType * gpu_in_data = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType)));
- DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+ float * gpu_in_data = static_cast<float*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(float)));
+ float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
- TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range);
- TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range);
- sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType));
+ TensorMap<Tensor<float, 4>> gpu_in(gpu_in_data, in_range);
+ TensorMap<Tensor<float, 4>> gpu_out(gpu_out_data, out_range);
+ sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(float));
gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
- for (IndexType i = 0; i < inDim1*bDim1; ++i) {
- for (IndexType j = 0; j < inDim2*bDim2; ++j) {
- for (IndexType k = 0; k < inDim3*bDim3; ++k) {
- for (IndexType l = 0; l < inDim4*bDim4; ++l) {
- VERIFY_IS_APPROX(input(i%inDim1,j%inDim2,k%inDim3,l%inDim4), out(i,j,k,l));
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 9; ++j) {
+ for (int k = 0; k < 5; ++k) {
+ for (int l = 0; l < 28; ++l) {
+ VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l));
}
}
}
@@ -127,18 +67,8 @@ static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){
sycl_device.deallocate(gpu_out_data);
}
-template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::device& d){
- std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
- QueueInterface queueInterface(d);
- auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device);
- test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device);
- test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);
- test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device);
-}
-
void test_cxx11_tensor_broadcast_sycl() {
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device));
- }
+ cl::sycl::gpu_selector s;
+ Eigen::SyclDevice sycl_device(s);
+ CALL_SUBTEST(test_broadcast_sycl(sycl_device));
}
diff --git a/eigen/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_builtins_sycl.cpp
deleted file mode 100644
index 400a31d..0000000
--- a/eigen/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_builtins_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
-#define EIGEN_USE_SYCL
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::array;
-using Eigen::SyclDevice;
-using Eigen::Tensor;
-using Eigen::TensorMap;
-
-namespace std {
-template <typename T> T rsqrt(T x) { return 1 / std::sqrt(x); }
-template <typename T> T square(T x) { return x * x; }
-template <typename T> T cube(T x) { return x * x * x; }
-template <typename T> T inverse(T x) { return 1 / x; }
-}
-
-#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \
- { \
- /* out OPERATOR in.FUNC() */ \
- Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
- in = in.random() + static_cast<SCALAR>(0.01); \
- out = out.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
- SCALAR *gpu_data = static_cast<SCALAR *>( \
- sycl_device.allocate(in.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_out = static_cast<SCALAR *>( \
- sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
- (in.size()) * sizeof(SCALAR)); \
- sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
- (out.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(SCALAR)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- SCALAR ver = reference(i); \
- ver OPERATOR std::FUNC(in(i)); \
- VERIFY_IS_APPROX(out(i), ver); \
- } \
- sycl_device.deallocate(gpu_data); \
- sycl_device.deallocate(gpu_data_out); \
- } \
- { \
- /* out OPERATOR out.FUNC() */ \
- Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
- out = out.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
- SCALAR *gpu_data_out = static_cast<SCALAR *>( \
- sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
- (out.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(SCALAR)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- SCALAR ver = reference(i); \
- ver OPERATOR std::FUNC(reference(i)); \
- VERIFY_IS_APPROX(out(i), ver); \
- } \
- sycl_device.deallocate(gpu_data_out); \
- }
-
-#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \
- TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout)
-
-#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \
- { \
- /* out = in.FUNC() */ \
- Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
- Tensor<bool, 3, Layout, int64_t> out(tensorRange); \
- in = in.random() + static_cast<SCALAR>(0.01); \
- SCALAR *gpu_data = static_cast<SCALAR *>( \
- sycl_device.allocate(in.size() * sizeof(SCALAR))); \
- bool *gpu_data_out = \
- static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
- TensorMap<Tensor<bool, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
- (in.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) = gpu.FUNC(); \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(bool)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \
- } \
- sycl_device.deallocate(gpu_data); \
- sycl_device.deallocate(gpu_data_out); \
- }
-
-#define TEST_UNARY_BUILTINS(SCALAR, Layout) \
- TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \
- TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \
- TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \
- TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \
- TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout)
-
-static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
- int64_t sizeDim1 = 10;
- int64_t sizeDim2 = 10;
- int64_t sizeDim3 = 10;
- array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
-
- TEST_UNARY_BUILTINS(float, RowMajor)
- TEST_UNARY_BUILTINS(float, ColMajor)
-}
-
-namespace std {
-template <typename T> T cwiseMax(T x, T y) { return std::max(x, y); }
-template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
-}
-
-#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \
- { \
- /* out = in_1.FUNC(in_2) */ \
- Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
- in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
- in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
- SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
- sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
- sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_out = static_cast<SCALAR *>( \
- sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
- (in_1.size()) * sizeof(SCALAR)); \
- sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
- (in_2.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(SCALAR)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- SCALAR ver = reference(i); \
- ver = std::FUNC(in_1(i), in_2(i)); \
- VERIFY_IS_APPROX(out(i), ver); \
- } \
- sycl_device.deallocate(gpu_data_1); \
- sycl_device.deallocate(gpu_data_2); \
- sycl_device.deallocate(gpu_data_out); \
- }
-
-#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \
- { \
- /* out = in_1 OPERATOR in_2 */ \
- Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
- in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
- in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
- SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
- sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
- sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_out = static_cast<SCALAR *>( \
- sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
- (in_1.size()) * sizeof(SCALAR)); \
- sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
- (in_2.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(SCALAR)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \
- } \
- sycl_device.deallocate(gpu_data_1); \
- sycl_device.deallocate(gpu_data_2); \
- sycl_device.deallocate(gpu_data_out); \
- }
-
-#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \
- { \
- /* out = in_1 OPERATOR 2 */ \
- Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
- Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
- in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
- Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
- SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
- sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
- SCALAR *gpu_data_out = static_cast<SCALAR *>( \
- sycl_device.allocate(out.size() * sizeof(SCALAR))); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
- TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
- sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
- (in_1.size()) * sizeof(SCALAR)); \
- gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
- (out.size()) * sizeof(SCALAR)); \
- for (int64_t i = 0; i < out.size(); ++i) { \
- VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \
- } \
- sycl_device.deallocate(gpu_data_1); \
- sycl_device.deallocate(gpu_data_out); \
- }
-
-#define TEST_BINARY_BUILTINS(SCALAR, Layout) \
- TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \
- TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \
- TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout)
-
-static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
- int64_t sizeDim1 = 10;
- int64_t sizeDim2 = 10;
- int64_t sizeDim3 = 10;
- array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- TEST_BINARY_BUILTINS(float, RowMajor)
- TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor)
- TEST_BINARY_BUILTINS(float, ColMajor)
- TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor)
-}
-
-void test_cxx11_tensor_builtins_sycl() {
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- QueueInterface queueInterface(device);
- Eigen::SyclDevice sycl_device(&queueInterface);
- CALL_SUBTEST(test_builtin_unary_sycl(sycl_device));
- CALL_SUBTEST(test_builtin_binary_sycl(sycl_device));
- }
-}
diff --git a/eigen/unsupported/test/cxx11_tensor_chipping.cpp b/eigen/unsupported/test/cxx11_tensor_chipping.cpp
index 89cf5c7..1832dec 100644
--- a/eigen/unsupported/test/cxx11_tensor_chipping.cpp
+++ b/eigen/unsupported/test/cxx11_tensor_chipping.cpp
@@ -43,7 +43,7 @@ static void test_simple_chip()
VERIFY_IS_EQUAL(chip2.dimension(2), 7);
VERIFY_IS_EQUAL(chip2.dimension(3), 11);
for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 5; ++j) {
+ for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
@@ -75,7 +75,7 @@ static void test_simple_chip()
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 11; ++l) {
+ for (int l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
}
}
@@ -126,7 +126,7 @@ static void test_dynamic_chip()
VERIFY_IS_EQUAL(chip2.dimension(2), 7);
VERIFY_IS_EQUAL(chip2.dimension(3), 11);
for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 5; ++j) {
+ for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 7; ++k) {
for (int l = 0; l < 11; ++l) {
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
@@ -158,7 +158,7 @@ static void test_dynamic_chip()
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
- for (int l = 0; l < 11; ++l) {
+ for (int l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
}
}
diff --git a/eigen/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_chipping_sycl.cpp
deleted file mode 100644
index 39e4f0a..0000000
--- a/eigen/unsupported/test/cxx11_tensor_chipping_sycl.cpp
+++ /dev/null
@@ -1,622 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-// Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_chipping_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
-#define EIGEN_USE_SYCL
-
-#include "main.h"
-
-#include <Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_static_chip_sycl(const Eigen::SyclDevice& sycl_device)
-{
- IndexType sizeDim1 = 2;
- IndexType sizeDim2 = 3;
- IndexType sizeDim3 = 5;
- IndexType sizeDim4 = 7;
- IndexType sizeDim5 = 11;
-
- array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
- array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
-
- Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
- Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
-
- tensor.setRandom();
-
- const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
- const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
- DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
- DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
-
- TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
- gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(1l);
- sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
-
- VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
- VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
- VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
- VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
-
- for (IndexType i = 0; i < sizeDim2; ++i) {
- for (IndexType j = 0; j < sizeDim3; ++j) {
- for (IndexType k = 0; k < sizeDim4; ++k) {
- for (IndexType l = 0; l < sizeDim5; ++l) {
- VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
- }
- }
- }
- }
-
- array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
- Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
- const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
- DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
-
- gpu_chip2.device(sycl_device)=gpu_tensor.template chip<1l>(1l);
- sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
-
- VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
- VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
- VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
- VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim3; ++j) {
- for (IndexType k = 0; k < sizeDim4; ++k) {
- for (IndexType l = 0; l < sizeDim5; ++l) {
- VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
- }
- }
- }
- }
-
- array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
- Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
- const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
- DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
-
- gpu_chip3.device(sycl_device)=gpu_tensor.template chip<2l>(2l);
- sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
-
- VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
- VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
- VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
- VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim4; ++k) {
- for (IndexType l = 0; l < sizeDim5; ++l) {
- VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
- }
- }
- }
- }
-
- array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
- Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
- const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
- DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
-
- gpu_chip4.device(sycl_device)=gpu_tensor.template chip<3l>(5l);
- sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
-
- VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
- VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
- VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
- VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
- for (IndexType l = 0; l < sizeDim5; ++l) {
- VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
- }
- }
- }
- }
-
-
- array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
- Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
- const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
- DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
-
- gpu_chip5.device(sycl_device)=gpu_tensor.template chip<4l>(7l);
- sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
-
- VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
- VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
- VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
- VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
- for (IndexType l = 0; l < sizeDim4; ++l) {
- VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
- }
- }
- }
- }
-
- sycl_device.deallocate(gpu_data_tensor);
- sycl_device.deallocate(gpu_data_chip1);
- sycl_device.deallocate(gpu_data_chip2);
- sycl_device.deallocate(gpu_data_chip3);
- sycl_device.deallocate(gpu_data_chip4);
- sycl_device.deallocate(gpu_data_chip5);
-}
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_dynamic_chip_sycl(const Eigen::SyclDevice& sycl_device)
-{
- IndexType sizeDim1 = 2;
- IndexType sizeDim2 = 3;
- IndexType sizeDim3 = 5;
- IndexType sizeDim4 = 7;
- IndexType sizeDim5 = 11;
-
- array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
- array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
-
- Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
- Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
-
- tensor.setRandom();
-
- const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
- const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
- DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
- DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
-
- TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
- gpu_chip1.device(sycl_device)=gpu_tensor.chip(1l,0l);
- sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
-
- VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2);
- VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3);
- VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4);
- VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5);
-
- for (IndexType i = 0; i < sizeDim2; ++i) {
- for (IndexType j = 0; j < sizeDim3; ++j) {
- for (IndexType k = 0; k < sizeDim4; ++k) {
- for (IndexType l = 0; l < sizeDim5; ++l) {
- VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l));
- }
- }
- }
- }
-
- array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
- Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange);
- const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType);
- DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
-
- gpu_chip2.device(sycl_device)=gpu_tensor.chip(1l,1l);
- sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
-
- VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1);
- VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3);
- VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4);
- VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5);
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim3; ++j) {
- for (IndexType k = 0; k < sizeDim4; ++k) {
- for (IndexType l = 0; l < sizeDim5; ++l) {
- VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l));
- }
- }
- }
- }
-
- array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
- Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange);
- const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType);
- DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange);
-
- gpu_chip3.device(sycl_device)=gpu_tensor.chip(2l,2l);
- sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize);
-
- VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1);
- VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2);
- VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4);
- VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5);
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim4; ++k) {
- for (IndexType l = 0; l < sizeDim5; ++l) {
- VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l));
- }
- }
- }
- }
-
- array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
- Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange);
- const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType);
- DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange);
-
- gpu_chip4.device(sycl_device)=gpu_tensor.chip(5l,3l);
- sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize);
-
- VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1);
- VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2);
- VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3);
- VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5);
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
- for (IndexType l = 0; l < sizeDim5; ++l) {
- VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l));
- }
- }
- }
- }
-
-
- array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
- Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange);
- const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType);
- DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange);
-
- gpu_chip5.device(sycl_device)=gpu_tensor.chip(7l,4l);
- sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize);
-
- VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1);
- VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2);
- VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3);
- VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4);
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
- for (IndexType l = 0; l < sizeDim4; ++l) {
- VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l));
- }
- }
- }
- }
- sycl_device.deallocate(gpu_data_tensor);
- sycl_device.deallocate(gpu_data_chip1);
- sycl_device.deallocate(gpu_data_chip2);
- sycl_device.deallocate(gpu_data_chip3);
- sycl_device.deallocate(gpu_data_chip4);
- sycl_device.deallocate(gpu_data_chip5);
-}
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_chip_in_expr(const Eigen::SyclDevice& sycl_device) {
-
- IndexType sizeDim1 = 2;
- IndexType sizeDim2 = 3;
- IndexType sizeDim3 = 5;
- IndexType sizeDim4 = 7;
- IndexType sizeDim5 = 11;
-
- array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
- array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
-
- Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
-
- Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange);
- Tensor<DataType, 4, DataLayout,IndexType> tensor1(chip1TensorRange);
- tensor.setRandom();
- tensor1.setRandom();
-
- const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
- const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType);
- DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
- DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
- DataType* gpu_data_tensor1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize));
-
- TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange);
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor1(gpu_data_tensor1, chip1TensorRange);
-
-
- sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
- sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize);
- gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(0l) + gpu_tensor1;
- sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize);
-
- for (int i = 0; i < sizeDim2; ++i) {
- for (int j = 0; j < sizeDim3; ++j) {
- for (int k = 0; k < sizeDim4; ++k) {
- for (int l = 0; l < sizeDim5; ++l) {
- float expected = tensor(0l,i,j,k,l) + tensor1(i,j,k,l);
- VERIFY_IS_EQUAL(chip1(i,j,k,l), expected);
- }
- }
- }
- }
-
- array<IndexType, 3> chip2TensorRange = {{sizeDim2, sizeDim4, sizeDim5}};
- Tensor<DataType, 3, DataLayout,IndexType> tensor2(chip2TensorRange);
- Tensor<DataType, 3, DataLayout,IndexType> chip2(chip2TensorRange);
- tensor2.setRandom();
- const size_t chip2TensorBuffSize =tensor2.size()*sizeof(DataType);
- DataType* gpu_data_tensor2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
- DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize));
- TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_tensor2(gpu_data_tensor2, chip2TensorRange);
- TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize);
- gpu_chip2.device(sycl_device)=gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2;
- sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize);
-
- for (int i = 0; i < sizeDim2; ++i) {
- for (int j = 0; j < sizeDim4; ++j) {
- for (int k = 0; k < sizeDim5; ++k) {
- float expected = tensor(0l,i,2l,j,k) + tensor2(i,j,k);
- VERIFY_IS_EQUAL(chip2(i,j,k), expected);
- }
- }
- }
- sycl_device.deallocate(gpu_data_tensor);
- sycl_device.deallocate(gpu_data_tensor1);
- sycl_device.deallocate(gpu_data_chip1);
- sycl_device.deallocate(gpu_data_tensor2);
- sycl_device.deallocate(gpu_data_chip2);
-}
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device)
-{
-
- IndexType sizeDim1 = 2;
- IndexType sizeDim2 = 3;
- IndexType sizeDim3 = 5;
- IndexType sizeDim4 = 7;
- IndexType sizeDim5 = 11;
-
- array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
- array<IndexType, 4> input2TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
-
- Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange);
- Tensor<DataType, 5, DataLayout,IndexType> input1(tensorRange);
- Tensor<DataType, 4, DataLayout,IndexType> input2(input2TensorRange);
- input1.setRandom();
- input2.setRandom();
-
-
- const size_t tensorBuffSize =tensor.size()*sizeof(DataType);
- const size_t input2TensorBuffSize =input2.size()*sizeof(DataType);
- DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
- DataType* gpu_data_input1 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
- DataType* gpu_data_input2 = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize));
-
- TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
- TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input1(gpu_data_input1, tensorRange);
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input2(gpu_data_input2, input2TensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize);
- gpu_tensor.device(sycl_device)=gpu_input1;
- sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize);
- gpu_tensor.template chip<0l>(1l).device(sycl_device)=gpu_input2;
- sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
-
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
- for (int l = 0; l < sizeDim4; ++l) {
- for (int m = 0; m < sizeDim5; ++m) {
- if (i != 1) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
- } else {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m));
- }
- }
- }
- }
- }
- }
-
- gpu_tensor.device(sycl_device)=gpu_input1;
- array<IndexType, 4> input3TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}};
- Tensor<DataType, 4, DataLayout,IndexType> input3(input3TensorRange);
- input3.setRandom();
-
- const size_t input3TensorBuffSize =input3.size()*sizeof(DataType);
- DataType* gpu_data_input3 = static_cast<DataType*>(sycl_device.allocate(input3TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input3(gpu_data_input3, input3TensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize);
- gpu_tensor.template chip<1l>(1l).device(sycl_device)=gpu_input3;
- sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
-
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k <sizeDim3; ++k) {
- for (int l = 0; l < sizeDim4; ++l) {
- for (int m = 0; m < sizeDim5; ++m) {
- if (j != 1) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
- } else {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m));
- }
- }
- }
- }
- }
- }
-
- gpu_tensor.device(sycl_device)=gpu_input1;
- array<IndexType, 4> input4TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}};
- Tensor<DataType, 4, DataLayout,IndexType> input4(input4TensorRange);
- input4.setRandom();
-
- const size_t input4TensorBuffSize =input4.size()*sizeof(DataType);
- DataType* gpu_data_input4 = static_cast<DataType*>(sycl_device.allocate(input4TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input4(gpu_data_input4, input4TensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize);
- gpu_tensor.template chip<2l>(3l).device(sycl_device)=gpu_input4;
- sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
-
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
- for (int l = 0; l < sizeDim4; ++l) {
- for (int m = 0; m < sizeDim5; ++m) {
- if (k != 3) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
- } else {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m));
- }
- }
- }
- }
- }
- }
-
- gpu_tensor.device(sycl_device)=gpu_input1;
- array<IndexType, 4> input5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}};
- Tensor<DataType, 4, DataLayout,IndexType> input5(input5TensorRange);
- input5.setRandom();
-
- const size_t input5TensorBuffSize =input5.size()*sizeof(DataType);
- DataType* gpu_data_input5 = static_cast<DataType*>(sycl_device.allocate(input5TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input5(gpu_data_input5, input5TensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize);
- gpu_tensor.template chip<3l>(4l).device(sycl_device)=gpu_input5;
- sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
-
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
- for (int l = 0; l < sizeDim4; ++l) {
- for (int m = 0; m < sizeDim5; ++m) {
- if (l != 4) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
- } else {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m));
- }
- }
- }
- }
- }
- }
- gpu_tensor.device(sycl_device)=gpu_input1;
- array<IndexType, 4> input6TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
- Tensor<DataType, 4, DataLayout,IndexType> input6(input6TensorRange);
- input6.setRandom();
-
- const size_t input6TensorBuffSize =input6.size()*sizeof(DataType);
- DataType* gpu_data_input6 = static_cast<DataType*>(sycl_device.allocate(input6TensorBuffSize));
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input6(gpu_data_input6, input6TensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize);
- gpu_tensor.template chip<4l>(5l).device(sycl_device)=gpu_input6;
- sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
-
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
- for (int l = 0; l < sizeDim4; ++l) {
- for (int m = 0; m < sizeDim5; ++m) {
- if (m != 5) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
- } else {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l));
- }
- }
- }
- }
- }
- }
-
-
- gpu_tensor.device(sycl_device)=gpu_input1;
- Tensor<DataType, 5, DataLayout,IndexType> input7(tensorRange);
- input7.setRandom();
-
- DataType* gpu_data_input7 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
- TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input7(gpu_data_input7, tensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize);
- gpu_tensor.chip(0l,0l).device(sycl_device)=gpu_input7.chip(0l,0l);
- sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize);
-
- for (int i = 0; i < sizeDim1; ++i) {
- for (int j = 0; j < sizeDim2; ++j) {
- for (int k = 0; k < sizeDim3; ++k) {
- for (int l = 0; l < sizeDim4; ++l) {
- for (int m = 0; m < sizeDim5; ++m) {
- if (i != 0) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m));
- } else {
- VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m));
- }
- }
- }
- }
- }
- }
- sycl_device.deallocate(gpu_data_tensor);
- sycl_device.deallocate(gpu_data_input1);
- sycl_device.deallocate(gpu_data_input2);
- sycl_device.deallocate(gpu_data_input3);
- sycl_device.deallocate(gpu_data_input4);
- sycl_device.deallocate(gpu_data_input5);
- sycl_device.deallocate(gpu_data_input6);
- sycl_device.deallocate(gpu_data_input7);
-
-}
-
-template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){
- QueueInterface queueInterface(s);
- auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
- test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
- test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device);
- test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device);
- test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device);
- test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device);
- test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device);
- test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device);
-}
-void test_cxx11_tensor_chipping_sycl()
-{
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(sycl_chipping_test_per_device<float>(device));
- }
-}
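The deleted test above repeatedly uses chip<Dim>(offset) as an lvalue to overwrite a single slice of a larger tensor on the SYCL device. A minimal host-only sketch of the same idea, with an illustrative function name and sizes that are not part of the test:

#include <unsupported/Eigen/CXX11/Tensor>

// Overwrite one row of a rank-2 tensor through chip<0>(offset); every other
// row is left untouched, which is exactly what the nested loops above verify.
inline bool chip_lvalue_sketch() {
  Eigen::Tensor<float, 2> t(3, 4);
  t.setZero();
  Eigen::Tensor<float, 1> row(4);
  row.setConstant(1.0f);
  t.chip<0>(1) = row;                        // assign into dimension 0, offset 1
  return t(1, 2) == 1.0f && t(0, 2) == 0.0f; // only row 1 changed
}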
diff --git a/eigen/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
deleted file mode 100644
index e3023a3..0000000
--- a/eigen/unsupported/test/cxx11_tensor_concatenation_sycl.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
-#define EIGEN_USE_SYCL
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::Tensor;
-
-template<typename DataType, int DataLayout, typename IndexType>
-static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
-{
- IndexType leftDim1 = 2;
- IndexType leftDim2 = 3;
- IndexType leftDim3 = 1;
- Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
- IndexType rightDim1 = 2;
- IndexType rightDim2 = 3;
- IndexType rightDim3 = 1;
- Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
-
- //IndexType concatDim1 = 3;
-// IndexType concatDim2 = 3;
-// IndexType concatDim3 = 1;
- //Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
-
- Tensor<DataType, 3, DataLayout, IndexType> left(leftRange);
- Tensor<DataType, 3, DataLayout, IndexType> right(rightRange);
- left.setRandom();
- right.setRandom();
-
- DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
- DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
- sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
- sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
- ///
- Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
- DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType)));
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
-
- //concatenation = left.concatenate(right, 0);
- gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0);
- sycl_device.memcpyDeviceToHost(concatenation1.data(), gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType));
-
- VERIFY_IS_EQUAL(concatenation1.dimension(0), 4);
- VERIFY_IS_EQUAL(concatenation1.dimension(1), 3);
- VERIFY_IS_EQUAL(concatenation1.dimension(2), 1);
- for (IndexType j = 0; j < 3; ++j) {
- for (IndexType i = 0; i < 2; ++i) {
- VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0));
- }
- for (IndexType i = 2; i < 4; ++i) {
- VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0));
- }
- }
-
- sycl_device.deallocate(gpu_out_data1);
- Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
- DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType)));
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
- gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1);
- sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType));
-
- //concatenation = left.concatenate(right, 1);
- VERIFY_IS_EQUAL(concatenation2.dimension(0), 2);
- VERIFY_IS_EQUAL(concatenation2.dimension(1), 6);
- VERIFY_IS_EQUAL(concatenation2.dimension(2), 1);
- for (IndexType i = 0; i < 2; ++i) {
- for (IndexType j = 0; j < 3; ++j) {
- VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0));
- }
- for (IndexType j = 3; j < 6; ++j) {
- VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0));
- }
- }
- sycl_device.deallocate(gpu_out_data2);
- Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
- DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType)));
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
- gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2);
- sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType));
-
- //concatenation = left.concatenate(right, 2);
- VERIFY_IS_EQUAL(concatenation3.dimension(0), 2);
- VERIFY_IS_EQUAL(concatenation3.dimension(1), 3);
- VERIFY_IS_EQUAL(concatenation3.dimension(2), 2);
- for (IndexType i = 0; i < 2; ++i) {
- for (IndexType j = 0; j < 3; ++j) {
- VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0));
- VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0));
- }
- }
- sycl_device.deallocate(gpu_out_data3);
- sycl_device.deallocate(gpu_in1_data);
- sycl_device.deallocate(gpu_in2_data);
-}
-template<typename DataType, int DataLayout, typename IndexType>
-static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
-{
-
- IndexType leftDim1 = 2;
- IndexType leftDim2 = 3;
- Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}};
-
- IndexType rightDim1 = 2;
- IndexType rightDim2 = 3;
- Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}};
-
- IndexType concatDim1 = 4;
- IndexType concatDim2 = 3;
- Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}};
-
- Tensor<DataType, 2, DataLayout, IndexType> left(leftRange);
- Tensor<DataType, 2, DataLayout, IndexType> right(rightRange);
- Tensor<DataType, 2, DataLayout, IndexType> result(resRange);
-
- left.setRandom();
- right.setRandom();
- result.setRandom();
-
- DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
- DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
- DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
-
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange);
-
- sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
- sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
- sycl_device.memcpyHostToDevice(gpu_out_data, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
-
-// t1.concatenate(t2, 0) = result;
- gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) =gpu_out;
- sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType));
- sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType));
-
- for (IndexType i = 0; i < 2; ++i) {
- for (IndexType j = 0; j < 3; ++j) {
- VERIFY_IS_EQUAL(left(i, j), result(i, j));
- VERIFY_IS_EQUAL(right(i, j), result(i+2, j));
- }
- }
- sycl_device.deallocate(gpu_in1_data);
- sycl_device.deallocate(gpu_in2_data);
- sycl_device.deallocate(gpu_out_data);
-}
-
-
-template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){
- QueueInterface queueInterface(s);
- auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device);
- test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device);
- test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
-}
-void test_cxx11_tensor_concatenation_sycl() {
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(tensorConcat_perDevice<float>(device));
- }
-}
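For reference, the axis argument of concatenate() in the deleted test selects which dimension grows; all other dimensions of the two operands must match. A host-only sketch with illustrative sizes:

#include <unsupported/Eigen/CXX11/Tensor>

// Stack two 2x3 tensors along dimension 0, producing a 4x3 result; the first
// two rows come from `a`, the last two from `b`.
inline Eigen::Tensor<float, 2> concat_sketch() {
  Eigen::Tensor<float, 2> a(2, 3), b(2, 3);
  a.setConstant(1.0f);
  b.setConstant(2.0f);
  Eigen::Tensor<float, 2> c = a.concatenate(b, 0);  // c.dimension(0) == 4
  return c;
}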
diff --git a/eigen/unsupported/test/cxx11_tensor_contract_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_contract_sycl.cpp
deleted file mode 100644
index 5bace66..0000000
--- a/eigen/unsupported/test/cxx11_tensor_contract_sycl.cpp
+++ /dev/null
@@ -1,290 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
-#define EIGEN_USE_SYCL
-
-#include <iostream>
-#include <chrono>
-#include <ctime>
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::array;
-using Eigen::SyclDevice;
-using Eigen::Tensor;
-using Eigen::TensorMap;
-template<int DataLayout, typename DataType, typename IndexType, typename Device>
-void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
-{
- typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
- static const DataType error_threshold =1e-4f;
-// std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
- // with these dimensions, the output has 300 * 140 elements, which is
- // more than 30 * 1024, which is the number of threads in blocks on
- // a 15 SM GK110 GPU
- Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
- Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
- Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
- Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
-// Eigen::array<DimPair, 1> dims(DimPair(1, 0));
- Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
- Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
- Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
- Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
-
- t_left.setRandom();
- t_right.setRandom();
-
- std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
- std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
- std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
-
- DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
- DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
- DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, result_dims);
-
- sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
- sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
-
- gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
- sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
-
- t_result = t_left.contract(t_right, dims);
-
- for (IndexType i = 0; i < t_result.size(); i++) {
- if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
- continue;
- }
- if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) {
- continue;
- }
- std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
- << " vs " << t_result_gpu(i) << std::endl;
- assert(false);
- }
- sycl_device.deallocate(d_t_left);
- sycl_device.deallocate(d_t_right);
- sycl_device.deallocate(d_t_result);
-}
-
-template<int DataLayout, typename DataType, typename IndexType, typename Device>
-void test_TF(const Device& sycl_device)
-{
- typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
- static const DataType error_threshold =1e-4f;
- Eigen::array<IndexType, 2> left_dims = {{2, 3}};
- Eigen::array<IndexType, 2> right_dims = {{3, 1}};
- Eigen::array<IndexType, 2> res_dims = {{2, 1}};
- Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
-
-
- Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
- Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
- Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
- Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
-
- t_left.data()[0] = 1.0f;
- t_left.data()[1] = 2.0f;
- t_left.data()[2] = 3.0f;
- t_left.data()[3] = 4.0f;
- t_left.data()[4] = 5.0f;
- t_left.data()[5] = 6.0f;
-
- t_right.data()[0] = -1.0f;
- t_right.data()[1] = 0.5f;
- t_right.data()[2] = 2.0f;
-
- std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
- std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
- std::size_t t_result_bytes = t_result.size()*sizeof(DataType);
-
-
- DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
- DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
- DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, res_dims);
-
- sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
- sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
-
- gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
- sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
-
- t_result = t_left.contract(t_right, dims);
-
- for (IndexType i = 0; i < t_result.size(); i++) {
- if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
- continue;
- }
- if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) {
- continue;
- }
- std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
- << " vs " << t_result_gpu(i) << std::endl;
- assert(false);
- }
- sycl_device.deallocate(d_t_left);
- sycl_device.deallocate(d_t_right);
- sycl_device.deallocate(d_t_result);
-
-
-}
-
-template<int DataLayout, typename DataType, typename IndexType, typename Device>
-void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
-{
- //std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
- // with these dimensions, the output has 300 * 140 elements, which is
- // more than 30 * 1024, which is the number of threads in blocks on
- // a 15 SM GK110 GPU
- typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
- static const DataType error_threshold =1e-4f;
- Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
- Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
- Tensor<DataType, 0, DataLayout, IndexType> t_result;
- Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu;
- Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
- Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
- Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
- t_left.setRandom();
- t_right.setRandom();
-
- std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
- std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
- std::size_t t_result_bytes = sizeof(DataType);
-
-
- DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
- DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
- DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType> > gpu_t_result(d_t_result);
-
- sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
- sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
-
- gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
- sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
-
- t_result = t_left.contract(t_right, dims);
-
- if (static_cast<DataType>(fabs(t_result() - t_result_gpu())) > error_threshold &&
- !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
- std::cout << "mismatch detected: " << t_result()
- << " vs " << t_result_gpu() << std::endl;
- assert(false);
- }
-
- sycl_device.deallocate(d_t_left);
- sycl_device.deallocate(d_t_right);
- sycl_device.deallocate(d_t_result);
-}
-
-
-template<int DataLayout, typename DataType, typename IndexType, typename Device>
-void test_sycl_contraction_m(const Device& sycl_device) {
- for (IndexType k = 32; k < 256; k++) {
- test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, 128);
- }
-}
-
-template<int DataLayout, typename DataType, typename IndexType, typename Device>
-void test_sycl_contraction_k(const Device& sycl_device) {
- for (IndexType k = 32; k < 256; k++) {
- test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, 128);
- }
-}
-
-template<int DataLayout, typename DataType, typename IndexType, typename Device>
-void test_sycl_contraction_n(const Device& sycl_device) {
- for (IndexType k = 32; k < 256; k++) {
- test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, 128, k);
- }
-}
-
-
-template<int DataLayout, typename DataType, typename IndexType, typename Device>
-void test_sycl_contraction_sizes(const Device& sycl_device) {
- IndexType m_sizes[] = { 31, 39, 63, 64, 65,
- 127, 129, 255, 257 , 511,
- 512, 513, 1023, 1024, 1025};
-
- IndexType n_sizes[] = { 31, 39, 63, 64, 65,
- 127, 129, 255, 257, 511,
- 512, 513, 1023, 1024, 1025};
-
- IndexType k_sizes[] = { 31, 39, 63, 64, 65,
- 95, 96, 127, 129, 255,
- 257, 511, 512, 513, 1023,
- 1024, 1025};
-
- for (IndexType i = 0; i < 15; i++) {
- for (IndexType j = 0; j < 15; j++) {
- for (IndexType k = 0; k < 17; k++) {
- test_sycl_contraction<DataLayout, DataType,IndexType>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
- }
- }
- }
-}
-
-template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s){
- QueueInterface queueInterface(s);
- auto sycl_device=Eigen::SyclDevice(&queueInterface);
- test_sycl_contraction<ColMajor, float,int64_t>(sycl_device, 32, 32, 32);
- test_sycl_contraction<RowMajor,float,int64_t>(sycl_device, 32, 32, 32);
- test_scalar<ColMajor,float,int64_t>(sycl_device, 32, 32, 32);
- test_scalar<RowMajor,float,int64_t>(sycl_device, 32, 32, 32);
- std::chrono::time_point<std::chrono::system_clock> start, end;
- start = std::chrono::system_clock::now();
- test_sycl_contraction<ColMajor,float,int64_t>(sycl_device, 128, 128, 128);
- test_sycl_contraction<RowMajor,float,int64_t>(sycl_device, 128, 128, 128);
- test_scalar<ColMajor,float,int64_t>(sycl_device, 128, 128, 128);
- test_scalar<RowMajor,float,int64_t>(sycl_device, 128, 128, 128);
- test_sycl_contraction_m<ColMajor, float, int64_t>(sycl_device);
- test_sycl_contraction_m<RowMajor, float, int64_t>(sycl_device);
- test_sycl_contraction_n<ColMajor, float, int64_t>(sycl_device);
- test_sycl_contraction_n<RowMajor, float, int64_t>(sycl_device);
- test_sycl_contraction_k<ColMajor, float, int64_t>(sycl_device);
- test_sycl_contraction_k<RowMajor, float, int64_t>(sycl_device);
- test_sycl_contraction_sizes<ColMajor, float, int64_t>(sycl_device);
- test_sycl_contraction_sizes<RowMajor, float, int64_t>(sycl_device);
- test_TF<RowMajor, float, int64_t>(sycl_device);
- test_TF<ColMajor, float, int64_t>(sycl_device);
-
- end = std::chrono::system_clock::now();
- std::chrono::duration<double> elapsed_seconds = end-start;
- std::time_t end_time = std::chrono::system_clock::to_time_t(end);
- std::cout << "finished computation at " << std::ctime(&end_time)
- << "elapsed time: " << elapsed_seconds.count() << "s\n";
-
-}
-
-void test_cxx11_tensor_contract_sycl() {
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(tensorContractionPerDevice(device));
- }
-}
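The contraction tests above all build their contraction axes from DimensionPair; DimPair(1, 0) pairs the second dimension of the left operand with the first dimension of the right, i.e. an ordinary matrix product. A host-only sketch with illustrative sizes:

#include <unsupported/Eigen/CXX11/Tensor>

// 2x3 times 3x4 contraction over the shared dimension of length 3.
inline Eigen::Tensor<float, 2> contract_sketch() {
  typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
  Eigen::Tensor<float, 2> lhs(2, 3), rhs(3, 4);
  lhs.setRandom();
  rhs.setRandom();
  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
  Eigen::Tensor<float, 2> res = lhs.contract(rhs, dims);  // result is 2x4
  return res;
}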
diff --git a/eigen/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_convolution_sycl.cpp
deleted file mode 100644
index a4226a6..0000000
--- a/eigen/unsupported/test/cxx11_tensor_convolution_sycl.cpp
+++ /dev/null
@@ -1,469 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
-#define EIGEN_USE_SYCL
-
-#include <iostream>
-#include <chrono>
-#include <ctime>
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-#include <iomanip>
-
-using Eigen::array;
-using Eigen::SyclDevice;
-using Eigen::Tensor;
-using Eigen::TensorMap;
-static const float error_threshold =1e-4f;
-
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device)
-{
- IndexType indim0 =53;
- IndexType indim1= 55;
- IndexType indim2= 51;
- IndexType outdim0=50;
- IndexType outdim1=55;
- IndexType outdim2=51;
- Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
- Eigen::array<IndexType, 1> kernel_dims = {{4}};
- Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
-
- Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
- Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
- Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
- Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
-
- Eigen::array<IndexType, 1> dims3{{0}};
-
- input.setRandom();
- kernel.setRandom();
- result.setZero();
- result_host.setZero();
-
- std::size_t input_bytes = input.size() * sizeof(DataType);
- std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
- std::size_t result_bytes = result.size() * sizeof(DataType);
-
- DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
- DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
- DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
- sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
- sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
-
- gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
- sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
-
- result_host=input.convolve(kernel, dims3);
-
-for(IndexType i=0; i< outdim0; i++ ){
- for(IndexType j=0; j< outdim1; j++ ){
- for(IndexType k=0; k< outdim2; k++ ){
- if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
- std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
- assert(false);
- }
- }
- }
-}
- sycl_device.deallocate(d_input);
- sycl_device.deallocate(d_kernel);
- sycl_device.deallocate(d_result);
-
-}
-
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device)
-{
- IndexType indim0 =53;
- IndexType indim1= 55;
- IndexType indim2= 51;
- IndexType outdim0=50;
- IndexType outdim1=51;
- IndexType outdim2=51;
- Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
- Eigen::array<IndexType, 2> kernel_dims = {{4,5}};
- Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
-
- Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
- Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims);
- Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
- Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
-
- Eigen::array<IndexType, 2> dims3{{0,1}};
-
- input.setRandom();
- kernel.setRandom();
- result.setZero();
- result_host.setZero();
-
- std::size_t input_bytes = input.size() * sizeof(DataType);
- std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
- std::size_t result_bytes = result.size() * sizeof(DataType);
-
- DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
- DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
- DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
- sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
- sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
-
- gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
- sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
-
- result_host=input.convolve(kernel, dims3);
-
-for(IndexType i=0; i< outdim0; i++ ){
- for(IndexType j=0; j< outdim1; j++ ){
- for(IndexType k=0; k< outdim2; k++ ){
- if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
- std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
- assert(false);
- }
- }
- }
-}
- sycl_device.deallocate(d_input);
- sycl_device.deallocate(d_kernel);
- sycl_device.deallocate(d_result);
-
-}
-
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device)
-{
- IndexType indim0 =53;
- IndexType indim1= 55;
- IndexType indim2= 51;
- IndexType outdim0=50;
- IndexType outdim1=51;
- IndexType outdim2=49;
- Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
- Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}};
- Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
-
- Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
- Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims);
- Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
- Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
-
- Eigen::array<IndexType, 3> dims3{{0,1,2}};
-
- input.setRandom();
- kernel.setRandom();
- result.setZero();
- result_host.setZero();
-
- std::size_t input_bytes = input.size() * sizeof(DataType);
- std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
- std::size_t result_bytes = result.size() * sizeof(DataType);
-
- DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
- DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
- DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
- sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
- sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
-
- gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
- sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
-
- result_host=input.convolve(kernel, dims3);
-
-for(IndexType i=0; i< outdim0; i++ ){
- for(IndexType j=0; j< outdim1; j++ ){
- for(IndexType k=0; k< outdim2; k++ ){
- if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
- std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
- assert(false);
- }
- }
- }
-}
- sycl_device.deallocate(d_input);
- sycl_device.deallocate(d_kernel);
- sycl_device.deallocate(d_result);
-
-}
-
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_evals(const Eigen::SyclDevice& sycl_device)
-{
- Eigen::array<IndexType, 2> input_dims = {{3, 3}};
- Eigen::array<IndexType, 1> kernel_dims = {{2}};
- Eigen::array<IndexType, 2> result_dims = {{2, 3}};
-
- Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
- Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
- Tensor<DataType, 2, DataLayout,IndexType> result(result_dims);
-
- Eigen::array<IndexType, 1> dims3{{0}};
-
- input.setRandom();
- kernel.setRandom();
- result.setZero();
-
- std::size_t input_bytes = input.size() * sizeof(DataType);
- std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
- std::size_t result_bytes = result.size() * sizeof(DataType);
-
- DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
- DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
- DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims);
- sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
- sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
-
- gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
- sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
-
- VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0
- VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2
- VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4
- VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1
- VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3
- VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5
-
- sycl_device.deallocate(d_input);
- sycl_device.deallocate(d_kernel);
- sycl_device.deallocate(d_result);
-}
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_expr(const Eigen::SyclDevice& sycl_device)
-{
- Eigen::array<IndexType, 2> input_dims = {{3, 3}};
- Eigen::array<IndexType, 2> kernel_dims = {{2, 2}};
- Eigen::array<IndexType, 2> result_dims = {{2, 2}};
-
- Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
- Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims);
- Tensor<DataType, 2, DataLayout, IndexType> result(result_dims);
-
- input.setRandom();
- kernel.setRandom();
- Eigen::array<IndexType, 2> dims;
- dims[0] = 0;
- dims[1] = 1;
-
- std::size_t input_bytes = input.size() * sizeof(DataType);
- std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
- std::size_t result_bytes = result.size() * sizeof(DataType);
-
- DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
- DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
- DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims);
- sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
- sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
-
- gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
- sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
-
- VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
- input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
- VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
- input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
- VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
- input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
- VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
- input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
-
- sycl_device.deallocate(d_input);
- sycl_device.deallocate(d_kernel);
- sycl_device.deallocate(d_result);
-}
-
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_modes(const Eigen::SyclDevice& sycl_device){
-
-Eigen::array<IndexType, 1> input_dims = {{3}};
-Eigen::array<IndexType, 1> kernel_dims = {{3}};
-
-Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
-Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
-
-input.setRandom();
-kernel.setRandom();
-Eigen::array<IndexType, 1> dims;
-dims[0] = 0;
-
- input(0) = 1.0f;
- input(1) = 2.0f;
- input(2) = 3.0f;
- kernel(0) = 0.5f;
- kernel(1) = 1.0f;
- kernel(2) = 0.0f;
-
- Eigen::array<std::pair<IndexType, IndexType>, 1> padding;
-
- // Emulate VALID mode (as defined in
- // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
- padding[0] = std::make_pair(0, 0);
- Tensor<DataType, 1, DataLayout, IndexType> valid(1);
-
- std::size_t input_bytes = input.size() * sizeof(DataType);
- std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
- std::size_t valid_bytes = valid.size() * sizeof(DataType);
-
- DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
- DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
- DataType * d_valid = static_cast<DataType*>(sycl_device.allocate(valid_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions());
- sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
- sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
-
- gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
- sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes);
-
- VERIFY_IS_EQUAL(valid.dimension(0), 1);
- VERIFY_IS_APPROX(valid(0), 2.5f);
-
- // Emulate SAME mode (as defined in
- // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
- padding[0] = std::make_pair(1, 1);
- Tensor<DataType, 1, DataLayout, IndexType> same(3);
- std::size_t same_bytes = same.size() * sizeof(DataType);
- DataType * d_same = static_cast<DataType*>(sycl_device.allocate(same_bytes));
- Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions());
- gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
- sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes);
-
- VERIFY_IS_EQUAL(same.dimension(0), 3);
- VERIFY_IS_APPROX(same(0), 1.0f);
- VERIFY_IS_APPROX(same(1), 2.5f);
- VERIFY_IS_APPROX(same(2), 4.0f);
-
- // Emulate FULL mode (as defined in
- // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
- padding[0] = std::make_pair(2, 2);
-
- Tensor<DataType, 1, DataLayout, IndexType> full(5);
- std::size_t full_bytes = full.size() * sizeof(DataType);
- DataType * d_full = static_cast<DataType*>(sycl_device.allocate(full_bytes));
- Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions());
- gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
- sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes);
-
- VERIFY_IS_EQUAL(full.dimension(0), 5);
- VERIFY_IS_APPROX(full(0), 0.0f);
- VERIFY_IS_APPROX(full(1), 1.0f);
- VERIFY_IS_APPROX(full(2), 2.5f);
- VERIFY_IS_APPROX(full(3), 4.0f);
- VERIFY_IS_APPROX(full(4), 1.5f);
-
- sycl_device.deallocate(d_input);
- sycl_device.deallocate(d_kernel);
- sycl_device.deallocate(d_valid);
- sycl_device.deallocate(d_same);
- sycl_device.deallocate(d_full);
-
-}
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_strides(const Eigen::SyclDevice& sycl_device){
-
- Eigen::array<IndexType, 1> input_dims = {{13}};
- Eigen::array<IndexType, 1> kernel_dims = {{3}};
-
- Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
- Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
- Tensor<DataType, 1, DataLayout, IndexType> result(2);
-
- input.setRandom();
- kernel.setRandom();
- Eigen::array<IndexType, 1> dims;
- dims[0] = 0;
-
- Eigen::array<IndexType, 1> stride_of_3;
- stride_of_3[0] = 3;
- Eigen::array<IndexType, 1> stride_of_2;
- stride_of_2[0] = 2;
-
- std::size_t input_bytes = input.size() * sizeof(DataType);
- std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
- std::size_t result_bytes = result.size() * sizeof(DataType);
-
- DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
- DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
- DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions());
- sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
- sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
-
- gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
- sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
-
- VERIFY_IS_EQUAL(result.dimension(0), 2);
- VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
- input(6)*kernel(2)));
- VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
- input(12)*kernel(2)));
-}
-
-template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){
- QueueInterface queueInterface(s);
- auto sycl_device=Eigen::SyclDevice(&queueInterface);
- test_larg_expr1D<float, RowMajor, int64_t>(sycl_device);
- test_larg_expr1D<float, ColMajor, int64_t>(sycl_device);
- test_larg_expr2D<float, RowMajor, int64_t>(sycl_device);
- test_larg_expr2D<float, ColMajor, int64_t>(sycl_device);
- test_larg_expr3D<float, RowMajor, int64_t>(sycl_device);
- test_larg_expr3D<float, ColMajor, int64_t>(sycl_device);
- test_evals<float, ColMajor, int64_t>(sycl_device);
- test_evals<float, RowMajor, int64_t>(sycl_device);
- test_expr<float, ColMajor, int64_t>(sycl_device);
- test_expr<float, RowMajor, int64_t>(sycl_device);
- test_modes<float, ColMajor, int64_t>(sycl_device);
- test_modes<float, RowMajor, int64_t>(sycl_device);
- test_strides<float, ColMajor, int64_t>(sycl_device);
- test_strides<float, RowMajor, int64_t>(sycl_device);
-}
-
-void test_cxx11_tensor_convolution_sycl() {
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(tensorConvolutionPerDevice(device));
- }
-}
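test_modes above emulates numpy's VALID/SAME/FULL convolution modes purely by padding before convolving. A host-only sketch of the SAME case with the same input and kernel values as the test (the function name is illustrative):

#include <unsupported/Eigen/CXX11/Tensor>
#include <cstddef>
#include <utility>

// Pad one element on each side, then convolve with a length-3 kernel along
// dimension 0; the output keeps the input length (SAME mode).
inline Eigen::Tensor<float, 1> same_mode_sketch() {
  Eigen::Tensor<float, 1> input(3), kernel(3);
  input.setValues({1.0f, 2.0f, 3.0f});
  kernel.setValues({0.5f, 1.0f, 0.0f});
  Eigen::array<std::pair<std::ptrdiff_t, std::ptrdiff_t>, 1> padding =
      {{std::pair<std::ptrdiff_t, std::ptrdiff_t>(1, 1)}};
  Eigen::array<std::ptrdiff_t, 1> dims = {{0}};
  Eigen::Tensor<float, 1> same = input.pad(padding).convolve(kernel, dims);
  return same;  // {1.0f, 2.5f, 4.0f}, matching the VERIFY calls above
}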
diff --git a/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp
index 3ecc68d..7f79753 100644
--- a/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp
+++ b/eigen/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -14,64 +14,18 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
-#include <stdint.h>
-#include <iostream>
-template <typename DataType, int DataLayout, typename IndexType>
-void test_device_memory(const Eigen::SyclDevice &sycl_device) {
- std::cout << "Running on : "
- << sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
- <<std::endl;
- IndexType sizeDim1 = 100;
- array<IndexType, 1> tensorRange = {{sizeDim1}};
- Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
- Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
- memset(in1.data(), 1, in1.size() * sizeof(DataType));
- DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
- sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
- sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
- for (IndexType i=0; i<in.size(); i++) {
- VERIFY_IS_EQUAL(in(i), in1(i));
- }
- sycl_device.deallocate(gpu_in_data);
+void test_device_sycl(const Eigen::SyclDevice &sycl_device) {
+ std::cout <<"Helo from ComputeCpp: the requested device exists and the device name is : "
+ << sycl_device.m_queue.get_device(). template get_info<cl::sycl::info::device::name>() <<std::endl;;
}
-
-template <typename DataType, int DataLayout, typename IndexType>
-void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
- VERIFY(sycl_device.ok());
- IndexType sizeDim1 = 100;
- array<IndexType, 1> tensorDims = {{sizeDim1}};
- DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType)));
- sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType));
-
- TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims);
- TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims);
- out.device(sycl_device) = in / in.constant(0);
-
- sycl_device.synchronize();
- VERIFY(!sycl_device.ok());
- sycl_device.deallocate(gpu_data);
-}
-
-template<typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
- std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
- QueueInterface queueInterface(d);
- auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_device_memory<DataType, RowMajor, int64_t>(sycl_device);
- test_device_memory<DataType, ColMajor, int64_t>(sycl_device);
- /// this test throws an exception; enable it if you want to see the exception
- //test_device_exceptions<DataType, RowMajor>(sycl_device);
- /// this test throws an exception; enable it if you want to see the exception
- //test_device_exceptions<DataType, ColMajor>(sycl_device);
-}
-
void test_cxx11_tensor_device_sycl() {
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(sycl_device_test_per_device<float>(device));
- }
+ cl::sycl::gpu_selector s;
+ Eigen::SyclDevice sycl_device(s);
+ CALL_SUBTEST(test_device_sycl(sycl_device));
}
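Note the asymmetry this hunk introduces: the deleted code enumerated every supported SYCL device through a QueueInterface, while the restored code hard-wires cl::sycl::gpu_selector. The removed per-device pattern, sketched below, mirrors the deleted sycl_device_test_per_device() helper and assumes Eigen's SYCL backend plus a SYCL toolchain; it is not buildable as plain C++:

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

// One QueueInterface/SyclDevice pair per supported device, as in the
// deleted helper above.
void run_on_all_sycl_devices() {
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    QueueInterface queueInterface(device);
    auto sycl_device = Eigen::SyclDevice(&queueInterface);
    std::cout << "Running on "
              << device.get_info<cl::sycl::info::device::name>() << std::endl;
    // ... run tensor expressions with .device(sycl_device) here ...
  }
}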
diff --git a/eigen/unsupported/test/cxx11_tensor_expr.cpp b/eigen/unsupported/test/cxx11_tensor_expr.cpp
index 129b4e6..77e24cb 100644
--- a/eigen/unsupported/test/cxx11_tensor_expr.cpp
+++ b/eigen/unsupported/test/cxx11_tensor_expr.cpp
@@ -300,51 +300,6 @@ static void test_select()
}
}
-template <typename Scalar>
-void test_minmax_nan_propagation_templ() {
- for (int size = 1; size < 17; ++size) {
- const Scalar kNan = std::numeric_limits<Scalar>::quiet_NaN();
- Tensor<Scalar, 1> vec_nan(size);
- Tensor<Scalar, 1> vec_zero(size);
- Tensor<Scalar, 1> vec_res(size);
- vec_nan.setConstant(kNan);
- vec_zero.setZero();
- vec_res.setZero();
-
- // Test that we propagate NaNs in the tensor when applying the
- // cwiseMax(scalar) operator, which is used for the Relu operator.
- vec_res = vec_nan.cwiseMax(Scalar(0));
- for (int i = 0; i < size; ++i) {
- VERIFY((numext::isnan)(vec_res(i)));
- }
-
- // Test that NaNs do not propagate if we reverse the arguments.
- vec_res = vec_zero.cwiseMax(kNan);
- for (int i = 0; i < size; ++i) {
- VERIFY_IS_EQUAL(vec_res(i), Scalar(0));
- }
-
- // Test that we propagate NaNs in the tensor when applying the
- // cwiseMin(scalar) operator.
- vec_res.setZero();
- vec_res = vec_nan.cwiseMin(Scalar(0));
- for (int i = 0; i < size; ++i) {
- VERIFY((numext::isnan)(vec_res(i)));
- }
-
- // Test that NaNs do not propagate if we reverse the arguments.
- vec_res = vec_zero.cwiseMin(kNan);
- for (int i = 0; i < size; ++i) {
- VERIFY_IS_EQUAL(vec_res(i), Scalar(0));
- }
- }
-}
-
-static void test_minmax_nan_propagation()
-{
- test_minmax_nan_propagation_templ<float>();
- test_minmax_nan_propagation_templ<double>();
-}
void test_cxx11_tensor_expr()
{
@@ -356,5 +311,4 @@ void test_cxx11_tensor_expr()
CALL_SUBTEST(test_functors());
CALL_SUBTEST(test_type_casting());
CALL_SUBTEST(test_select());
- CALL_SUBTEST(test_minmax_nan_propagation());
}
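The removed test documented a deliberate asymmetry of cwiseMax/cwiseMin: a NaN stored in the tensor propagates, while a NaN passed as the scalar argument does not. A host-only sketch of exactly that check (name and size illustrative):

#include <unsupported/Eigen/CXX11/Tensor>
#include <limits>

// vec_nan.cwiseMax(0) stays NaN (the Relu case), but vec_zero.cwiseMax(NaN)
// keeps the zeros, as the deleted loops asserted.
inline bool nan_asymmetry_sketch() {
  const float kNan = std::numeric_limits<float>::quiet_NaN();
  Eigen::Tensor<float, 1> vec_nan(4), vec_zero(4);
  vec_nan.setConstant(kNan);
  vec_zero.setZero();
  Eigen::Tensor<float, 1> relu = vec_nan.cwiseMax(0.0f);
  Eigen::Tensor<float, 1> clamped = vec_zero.cwiseMax(kNan);
  return Eigen::numext::isnan(relu(0)) && clamped(0) == 0.0f;
}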
diff --git a/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp b/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp
index e6274f8..4c660de 100644
--- a/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/eigen/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -21,7 +21,7 @@ static void test_0d()
TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
VERIFY_IS_EQUAL(scalar1.rank(), 0);
VERIFY_IS_EQUAL(scalar1.size(), 1);
- VERIFY_IS_EQUAL(internal::array_prod(scalar1.dimensions()), 1);
+ VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
scalar1() = 7.0;
scalar2() = 13.0;
diff --git a/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
index aca036c..5690da7 100644
--- a/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
+++ b/eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -14,43 +14,43 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
-template <typename DataType, int DataLayout, typename IndexType>
+
void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
- IndexType sizeDim1 = 100;
- IndexType sizeDim2 = 20;
- IndexType sizeDim3 = 20;
- Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
- Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange);
- Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
+ int sizeDim1 = 100;
+ int sizeDim2 = 200;
+ int sizeDim3 = 200;
+ Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ Eigen::Tensor<float, 3> in1(tensorRange);
+ Eigen::Tensor<float, 3> in2(tensorRange);
+ Eigen::Tensor<float, 3> out(tensorRange);
- DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
- DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
- DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType)));
+ float * gpu_in1_data = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
+ float * gpu_in2_data = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
+ float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
in1 = in1.random() + in1.constant(10.0f);
in2 = in2.random() + in2.constant(10.0f);
// creating TensorMap from tensor
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
- Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
- sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
- sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
+ Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
+ Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
+ Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
+ sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(float));
+ sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(float));
/// c=(a+b)*b
gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i, j, k),
(in1(i, j, k) + in2(i, j, k)) * in2(i, j, k));
}
@@ -63,14 +63,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
}
-template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){
- QueueInterface queueInterface(s);
- auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device);
- test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device);
-}
void test_cxx11_tensor_forced_eval_sycl() {
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(tensorForced_evalperDevice<float>(device));
- }
+ cl::sycl::gpu_selector s;
+ Eigen::SyclDevice sycl_device(s);
+ CALL_SUBTEST(test_forced_eval_sycl(sycl_device));
}
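The hunk above replaces the templated, multi-device forced-eval harness with a single float run on a cl::sycl::gpu_selector. For reference, a minimal standalone sketch of the pattern the new test body relies on follows: device setup from a selector, explicit host/device copies, and a forced .eval() of the (a+b) sub-expression before the multiply. The selector, device wrapper, and allocate/memcpy calls mirror the added lines; the extents and function name are illustrative only.

// Minimal sketch of the gpu_selector-based forced-eval pattern used above.
#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

void forced_eval_sketch() {
  cl::sycl::gpu_selector s;                 // pick a GPU, as in the new test entry point
  Eigen::SyclDevice sycl_device(s);         // Eigen device wrapper around the SYCL queue

  Eigen::array<int, 3> range = {{4, 4, 4}}; // small illustrative extents
  Eigen::Tensor<float, 3> in1(range), in2(range), out(range);
  in1.setRandom();
  in2.setRandom();

  float* d_in1 = static_cast<float*>(sycl_device.allocate(in1.size() * sizeof(float)));
  float* d_in2 = static_cast<float*>(sycl_device.allocate(in2.size() * sizeof(float)));
  float* d_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));

  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in1(d_in1, range);
  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in2(d_in2, range);
  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(d_out, range);

  sycl_device.memcpyHostToDevice(d_in1, in1.data(), in1.size() * sizeof(float));
  sycl_device.memcpyHostToDevice(d_in2, in2.data(), in2.size() * sizeof(float));

  // .eval() forces (a+b) to be materialized on the device before the multiply,
  // which is what the test compares against the host-side computation.
  gpu_out.device(sycl_device) = (gpu_in1 + gpu_in2).eval() * gpu_in2;
  sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float));

  sycl_device.deallocate(d_in1);
  sycl_device.deallocate(d_in2);
  sycl_device.deallocate(d_out);
}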
diff --git a/eigen/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_morphing_sycl.cpp
deleted file mode 100644
index 9b521bc..0000000
--- a/eigen/unsupported/test/cxx11_tensor_morphing_sycl.cpp
+++ /dev/null
@@ -1,248 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-// Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_morphing_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
-#define EIGEN_USE_SYCL
-
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::array;
-using Eigen::SyclDevice;
-using Eigen::Tensor;
-using Eigen::TensorMap;
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
-{
- typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1);
- typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7);
- typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7);
- typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21);
-
- Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1);
- Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2);
- Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3);
- Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4);
-
- tensor1.setRandom();
-
- DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType)));
- DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
- DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
- DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType)));
-
- TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1);
- TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2);
- TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3);
- TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4);
-
- sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
-
- gpu2.device(sycl_device)=gpu1.reshape(dim2);
- sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor1.size())*sizeof(DataType));
-
- gpu3.device(sycl_device)=gpu1.reshape(dim3);
- sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType));
-
- gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4);
- sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType));
- for (IndexType i = 0; i < 2; ++i){
- for (IndexType j = 0; j < 3; ++j){
- for (IndexType k = 0; k < 7; ++k){
- VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); ///ColMajor
- if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
- VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); ///ColMajor
- VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k)); ///ColMajor
- }
- else{
- //VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); /// RowMajor
- VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j*7 +k)); /// RowMajor
- VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i*3 +j,k)); /// RowMajor
- }
- }
- }
- }
- sycl_device.deallocate(gpu_data1);
- sycl_device.deallocate(gpu_data2);
- sycl_device.deallocate(gpu_data3);
- sycl_device.deallocate(gpu_data4);
-}
-
-
-template<typename DataType, int DataLayout, typename IndexType>
-static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
-{
- typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7);
- typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7);
- typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1);
- Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1);
- Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2);
- Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3);
-
- tensor.setRandom();
-
- DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
- DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType)));
- DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType)));
-
- TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1);
- TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2);
- TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3);
-
- sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
-
- gpu2.reshape(dim1).device(sycl_device)=gpu1;
- sycl_device.memcpyDeviceToHost(tensor2d.data(), gpu_data2,(tensor2d.size())*sizeof(DataType));
-
- gpu3.reshape(dim1).device(sycl_device)=gpu1;
- sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType));
-
-
- for (IndexType i = 0; i < 2; ++i){
- for (IndexType j = 0; j < 3; ++j){
- for (IndexType k = 0; k < 7; ++k){
- VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k));
- if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
- VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); ///ColMajor
- }
- else{
- VERIFY_IS_EQUAL(tensor2d(i*3 +j,k),tensor(i,j,k)); /// RowMajor
- }
- }
- }
- }
- sycl_device.deallocate(gpu_data1);
- sycl_device.deallocate(gpu_data2);
- sycl_device.deallocate(gpu_data3);
-}
-
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_simple_slice(const Eigen::SyclDevice &sycl_device)
-{
- IndexType sizeDim1 = 2;
- IndexType sizeDim2 = 3;
- IndexType sizeDim3 = 5;
- IndexType sizeDim4 = 7;
- IndexType sizeDim5 = 11;
- array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
- Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange);
- tensor.setRandom();
- array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
- Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
-
- DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
- DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
- TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
- Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
- Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
- sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
- gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
- sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
- VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
-
-
- array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
- Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
- DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
- Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
- Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
- gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
- sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
- for (IndexType i = 0; i < 2; ++i) {
- for (IndexType j = 0; j < 2; ++j) {
- for (IndexType k = 0; k < 3; ++k) {
- VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
- }
- }
- }
- sycl_device.deallocate(gpu_data1);
- sycl_device.deallocate(gpu_data2);
- sycl_device.deallocate(gpu_data3);
-}
-
-template<typename DataType, int DataLayout, typename IndexType>
-static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
-{
- typedef Tensor<DataType, 2, DataLayout, IndexType> Tensor2f;
- typedef Eigen::DSizes<IndexType, 2> Index2;
- IndexType sizeDim1 = 7L;
- IndexType sizeDim2 = 11L;
- array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
- Tensor<DataType, 2, DataLayout, IndexType> tensor(tensorRange),tensor2(tensorRange);
- IndexType sliceDim1 = 2;
- IndexType sliceDim2 = 3;
- array<IndexType, 2> sliceRange = {{sliceDim1, sliceDim2}};
- Tensor2f slice(sliceRange);
- Index2 strides(1L,1L);
- Index2 indicesStart(3L,4L);
- Index2 indicesStop(5L,7L);
- Index2 lengths(2L,3L);
-
- DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
- DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType)));
- DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice.size()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
- TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, tensorRange);
- TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu3(gpu_data3, sliceRange);
-
-
- tensor.setRandom();
- sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
- gpu2.device(sycl_device)=gpu1;
-
- slice.setRandom();
- sycl_device.memcpyHostToDevice(gpu_data3, slice.data(),(slice.size())*sizeof(DataType));
-
-
- gpu1.slice(indicesStart,lengths).device(sycl_device)=gpu3;
- gpu2.stridedSlice(indicesStart,indicesStop,strides).device(sycl_device)=gpu3;
- sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType));
- sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
-
- for(IndexType i=0;i<sizeDim1;i++)
- for(IndexType j=0;j<sizeDim2;j++){
- VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
- }
- sycl_device.deallocate(gpu_data1);
- sycl_device.deallocate(gpu_data2);
- sycl_device.deallocate(gpu_data3);
-}
-
-template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){
- QueueInterface queueInterface(s);
- auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_simple_slice<DataType, RowMajor, int64_t>(sycl_device);
- test_simple_slice<DataType, ColMajor, int64_t>(sycl_device);
- test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
- test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device);
- test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
- test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
- test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
- test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
-}
-void test_cxx11_tensor_morphing_sycl()
-{
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(sycl_morphing_test_per_device<float>(device));
- }
-}
diff --git a/eigen/unsupported/test/cxx11_tensor_notification.cpp b/eigen/unsupported/test/cxx11_tensor_notification.cpp
index 183ef02..c946007 100644
--- a/eigen/unsupported/test/cxx11_tensor_notification.cpp
+++ b/eigen/unsupported/test/cxx11_tensor_notification.cpp
@@ -13,6 +13,15 @@
#include "main.h"
#include <Eigen/CXX11/Tensor>
+#if EIGEN_OS_WIN || EIGEN_OS_WIN64
+#include <windows.h>
+void sleep(int seconds) {
+ Sleep(seconds*1000);
+}
+#else
+#include <unistd.h>
+#endif
+
namespace {
@@ -31,7 +40,7 @@ static void test_notification_single()
Eigen::Notification n;
std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
thread_pool.Schedule(func);
- EIGEN_SLEEP(1000);
+ sleep(1);
// The thread should be waiting for the notification.
VERIFY_IS_EQUAL(counter, 0);
@@ -39,7 +48,7 @@ static void test_notification_single()
// Unblock the thread
n.Notify();
- EIGEN_SLEEP(1000);
+ sleep(1);
// Verify the counter has been incremented
VERIFY_IS_EQUAL(counter, 1);
@@ -58,10 +67,10 @@ static void test_notification_multiple()
thread_pool.Schedule(func);
thread_pool.Schedule(func);
thread_pool.Schedule(func);
- EIGEN_SLEEP(1000);
+ sleep(1);
VERIFY_IS_EQUAL(counter, 0);
n.Notify();
- EIGEN_SLEEP(1000);
+ sleep(1);
VERIFY_IS_EQUAL(counter, 4);
}
diff --git a/eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 908a5e5..2f86980 100644
--- a/eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -200,8 +200,6 @@ void test_cuda_trancendental() {
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
- Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
@@ -209,7 +207,6 @@ void test_cuda_trancendental() {
gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
- gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
@@ -220,9 +217,6 @@ void test_cuda_trancendental() {
gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
- gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
- gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
-
Tensor<float, 1> input1(num_elem);
Tensor<Eigen::half, 1> half_prec1(num_elem);
Tensor<Eigen::half, 1> full_prec1(num_elem);
diff --git a/eigen/unsupported/test/cxx11_tensor_padding_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_padding_sycl.cpp
deleted file mode 100644
index dc748b7..0000000
--- a/eigen/unsupported/test/cxx11_tensor_padding_sycl.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-// Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_padding_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
-#define EIGEN_USE_SYCL
-
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::array;
-using Eigen::SyclDevice;
-using Eigen::Tensor;
-using Eigen::TensorMap;
-
-
-template<typename DataType, int DataLayout, typename IndexType>
-static void test_simple_padding(const Eigen::SyclDevice& sycl_device)
-{
-
- IndexType sizeDim1 = 2;
- IndexType sizeDim2 = 3;
- IndexType sizeDim3 = 5;
- IndexType sizeDim4 = 7;
- array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
-
- Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
- tensor.setRandom();
-
- array<std::pair<IndexType, IndexType>, 4> paddings;
- paddings[0] = std::make_pair(0, 0);
- paddings[1] = std::make_pair(2, 1);
- paddings[2] = std::make_pair(3, 4);
- paddings[3] = std::make_pair(0, 0);
-
- IndexType padedSizeDim1 = 2;
- IndexType padedSizeDim2 = 6;
- IndexType padedSizeDim3 = 12;
- IndexType padedSizeDim4 = 7;
- array<IndexType, 4> padedtensorRange = {{padedSizeDim1, padedSizeDim2, padedSizeDim3, padedSizeDim4}};
-
- Tensor<DataType, 4, DataLayout, IndexType> padded(padedtensorRange);
-
-
- DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
- DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(padded.size()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
- TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu2(gpu_data2, padedtensorRange);
-
- VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
- VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
- VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
- VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
- sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
- gpu2.device(sycl_device)=gpu1.pad(paddings);
- sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType));
- for (IndexType i = 0; i < padedSizeDim1; ++i) {
- for (IndexType j = 0; j < padedSizeDim2; ++j) {
- for (IndexType k = 0; k < padedSizeDim3; ++k) {
- for (IndexType l = 0; l < padedSizeDim4; ++l) {
- if (j >= 2 && j < 5 && k >= 3 && k < 8) {
- VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
- } else {
- VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
- }
- }
- }
- }
- }
- sycl_device.deallocate(gpu_data1);
- sycl_device.deallocate(gpu_data2);
-}
-
-template<typename DataType, int DataLayout, typename IndexType>
-static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
-{
- IndexType sizeDim1 = 2;
- IndexType sizeDim2 = 3;
- IndexType sizeDim3 = 5;
- IndexType sizeDim4 = 7;
- array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
-
- Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
- tensor.setRandom();
-
- array<std::pair<IndexType, IndexType>, 4> paddings;
- paddings[0] = std::make_pair(0, 0);
- paddings[1] = std::make_pair(2, 1);
- paddings[2] = std::make_pair(3, 4);
- paddings[3] = std::make_pair(0, 0);
-
- Eigen::DSizes<IndexType, 2> reshape_dims;
- reshape_dims[0] = 12;
- reshape_dims[1] = 84;
-
-
- Tensor<DataType, 2, DataLayout, IndexType> result(reshape_dims);
-
- DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
- DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(result.size()*sizeof(DataType)));
- TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
- TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, reshape_dims);
-
-
- sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
- gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims);
- sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType));
-
- for (IndexType i = 0; i < 2; ++i) {
- for (IndexType j = 0; j < 6; ++j) {
- for (IndexType k = 0; k < 12; ++k) {
- for (IndexType l = 0; l < 7; ++l) {
-        const DataType result_value = DataLayout == ColMajor ?
- result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
- if (j >= 2 && j < 5 && k >= 3 && k < 8) {
- VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l));
- } else {
- VERIFY_IS_EQUAL(result_value, 0.0f);
- }
- }
- }
- }
- }
- sycl_device.deallocate(gpu_data1);
- sycl_device.deallocate(gpu_data2);
-}
-
-template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){
- QueueInterface queueInterface(s);
- auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_simple_padding<DataType, RowMajor, int64_t>(sycl_device);
- test_simple_padding<DataType, ColMajor, int64_t>(sycl_device);
- test_padded_expr<DataType, RowMajor, int64_t>(sycl_device);
- test_padded_expr<DataType, ColMajor, int64_t>(sycl_device);
-
-}
-void test_cxx11_tensor_padding_sycl()
-{
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(sycl_padding_test_per_device<float>(device));
- }
-}
diff --git a/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index 440d48b..a9ef829 100644
--- a/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -14,168 +14,125 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) {
- const IndexType num_rows = 452;
- const IndexType num_cols = 765;
- array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
+static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
- Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
- Tensor<DataType, 0, DataLayout, IndexType> full_redux;
- Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
-
- in.setRandom();
-
- full_redux = in.mean();
-
- DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
- DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));
-
- TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);
-
- sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
- out_gpu.device(sycl_device) = in_gpu.mean();
- sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
- // Check that the CPU and GPU reductions return the same result.
- VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
- sycl_device.deallocate(gpu_in_data);
- sycl_device.deallocate(gpu_out_data);
-}
+ const int num_rows = 452;
+ const int num_cols = 765;
+ array<int, 2> tensorRange = {{num_rows, num_cols}};
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) {
-
- const IndexType num_rows = 876;
- const IndexType num_cols = 953;
- array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
-
- Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
- Tensor<DataType, 0, DataLayout, IndexType> full_redux;
- Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
+ Tensor<float, 2> in(tensorRange);
+ Tensor<float, 0> full_redux;
+ Tensor<float, 0> full_redux_gpu;
in.setRandom();
- full_redux = in.minimum();
+ full_redux = in.sum();
- DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
- DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));
+ float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
+ float* gpu_out_data =(float*)sycl_device.allocate(sizeof(float));
- TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);
+ TensorMap<Tensor<float, 2> > in_gpu(gpu_in_data, tensorRange);
+ TensorMap<Tensor<float, 0> > out_gpu(gpu_out_data);
- sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
- out_gpu.device(sycl_device) = in_gpu.minimum();
- sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+ out_gpu.device(sycl_device) = in_gpu.sum();
+ sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(float));
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
+static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) {
-
- IndexType dim_x = 145;
- IndexType dim_y = 1;
- IndexType dim_z = 67;
+ int dim_x = 145;
+ int dim_y = 1;
+ int dim_z = 67;
- array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
- Eigen::array<IndexType, 1> red_axis;
+ array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+ Eigen::array<int, 1> red_axis;
red_axis[0] = 0;
- array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
+ array<int, 2> reduced_tensorRange = {{dim_y, dim_z}};
- Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
- Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
- Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+ Tensor<float, 3> in(tensorRange);
+ Tensor<float, 2> redux(reduced_tensorRange);
+ Tensor<float, 2> redux_gpu(reduced_tensorRange);
in.setRandom();
- redux= in.maximum(red_axis);
+ redux= in.sum(red_axis);
- DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
- DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
+ float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
+ float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
- TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);
+ TensorMap<Tensor<float, 3> > in_gpu(gpu_in_data, tensorRange);
+ TensorMap<Tensor<float, 2> > out_gpu(gpu_out_data, reduced_tensorRange);
- sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
- out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
- sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+ out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+ sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
// Check that the CPU and GPU reductions return the same result.
- for(IndexType j=0; j<reduced_tensorRange[0]; j++ )
- for(IndexType k=0; k<reduced_tensorRange[1]; k++ )
+ for(int j=0; j<reduced_tensorRange[0]; j++ )
+ for(int k=0; k<reduced_tensorRange[1]; k++ )
VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) {
+static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) {
- IndexType dim_x = 567;
- IndexType dim_y = 1;
- IndexType dim_z = 47;
+ int dim_x = 567;
+ int dim_y = 1;
+ int dim_z = 47;
- array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
- Eigen::array<IndexType, 1> red_axis;
+ array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+ Eigen::array<int, 1> red_axis;
red_axis[0] = 2;
- array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
+ array<int, 2> reduced_tensorRange = {{dim_x, dim_y}};
- Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
- Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
- Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
+ Tensor<float, 3> in(tensorRange);
+ Tensor<float, 2> redux(reduced_tensorRange);
+ Tensor<float, 2> redux_gpu(reduced_tensorRange);
in.setRandom();
redux= in.sum(red_axis);
- DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
- DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
+ float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
+ float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
- TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);
+ TensorMap<Tensor<float, 3> > in_gpu(gpu_in_data, tensorRange);
+ TensorMap<Tensor<float, 2> > out_gpu(gpu_out_data, reduced_tensorRange);
- sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
- sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));
+ sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
// Check that the CPU and GPU reductions return the same result.
- for(IndexType j=0; j<reduced_tensorRange[0]; j++ )
- for(IndexType k=0; k<reduced_tensorRange[1]; k++ )
+ for(int j=0; j<reduced_tensorRange[0]; j++ )
+ for(int k=0; k<reduced_tensorRange[1]; k++ )
VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
-template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::device& d){
- std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
- QueueInterface queueInterface(d);
- auto sycl_device = Eigen::SyclDevice(&queueInterface);
-
- test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
- test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
- test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
- test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
- test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
- test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
- test_first_dim_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
- test_last_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
-}
+
void test_cxx11_tensor_reduction_sycl() {
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(sycl_reduction_test_per_device<float>(device));
- }
+ cl::sycl::gpu_selector s;
+ Eigen::SyclDevice sycl_device(s);
+ CALL_SUBTEST((test_full_reductions_sycl(sycl_device)));
+ CALL_SUBTEST((test_first_dim_reductions_sycl(sycl_device)));
+ CALL_SUBTEST((test_last_dim_reductions_sycl(sycl_device)));
+
}
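For the full-reduction path kept by this hunk, the sketch below shows the minimal shape of the test: reduce a 2-D tensor to a rank-0 tensor on the device and compare it with the host result. It assumes EIGEN_USE_SYCL and <unsupported/Eigen/CXX11/Tensor> as in the earlier sketch; the extents and function name are illustrative, while the allocate/TensorMap/memcpy calls are the ones used in the hunk above.

// Sketch: device-side full reduction into a rank-0 (scalar) tensor.
void full_reduction_sketch(const Eigen::SyclDevice& sycl_device) {
  Eigen::array<int, 2> range = {{8, 16}};    // illustrative extents
  Eigen::Tensor<float, 2> in(range);
  Eigen::Tensor<float, 0> redux, redux_gpu;  // rank-0 tensors hold the scalar result
  in.setRandom();
  redux = in.sum();                          // host reference value

  float* d_in  = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
  float* d_out = static_cast<float*>(sycl_device.allocate(sizeof(float)));
  Eigen::TensorMap<Eigen::Tensor<float, 2>> in_gpu(d_in, range);
  Eigen::TensorMap<Eigen::Tensor<float, 0>> out_gpu(d_out);

  sycl_device.memcpyHostToDevice(d_in, in.data(), in.size() * sizeof(float));
  out_gpu.device(sycl_device) = in_gpu.sum();             // full reduction on the device
  sycl_device.memcpyDeviceToHost(redux_gpu.data(), d_out, sizeof(float));

  // redux_gpu() and redux() should now agree to floating-point tolerance.
  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_out);
}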
diff --git a/eigen/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_reverse_sycl.cpp
deleted file mode 100644
index 2f54844..0000000
--- a/eigen/unsupported/test/cxx11_tensor_reverse_sycl.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
-#define EIGEN_USE_SYCL
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
-
- IndexType dim1 = 2;
- IndexType dim2 = 3;
- IndexType dim3 = 5;
- IndexType dim4 = 7;
-
- array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
- Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
- Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange);
- tensor.setRandom();
-
- array<bool, 4> dim_rev;
- dim_rev[0] = false;
- dim_rev[1] = true;
- dim_rev[2] = true;
- dim_rev[3] = false;
-
- DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
- DataType* gpu_out_data =static_cast<DataType*>(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType)));
-
- TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data, tensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
- out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
- sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
-  // Check that the device-side reverse matches the expected element ordering.
- for (IndexType i = 0; i < 2; ++i) {
- for (IndexType j = 0; j < 3; ++j) {
- for (IndexType k = 0; k < 5; ++k) {
- for (IndexType l = 0; l < 7; ++l) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l));
- }
- }
- }
- }
- dim_rev[0] = true;
- dim_rev[1] = false;
- dim_rev[2] = false;
- dim_rev[3] = false;
-
- out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
- sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
-
- for (IndexType i = 0; i < 2; ++i) {
- for (IndexType j = 0; j < 3; ++j) {
- for (IndexType k = 0; k < 5; ++k) {
- for (IndexType l = 0; l < 7; ++l) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l));
- }
- }
- }
- }
-
- dim_rev[0] = true;
- dim_rev[1] = false;
- dim_rev[2] = false;
- dim_rev[3] = true;
- out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
- sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
-
- for (IndexType i = 0; i < 2; ++i) {
- for (IndexType j = 0; j < 3; ++j) {
- for (IndexType k = 0; k < 5; ++k) {
- for (IndexType l = 0; l < 7; ++l) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l));
- }
- }
- }
- }
-
- sycl_device.deallocate(gpu_in_data);
- sycl_device.deallocate(gpu_out_data);
-}
-
-
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue)
-{
- IndexType dim1 = 2;
- IndexType dim2 = 3;
- IndexType dim3 = 5;
- IndexType dim4 = 7;
-
- array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
- Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
- Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange);
- Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange);
- tensor.setRandom();
-
- array<bool, 4> dim_rev;
- dim_rev[0] = false;
- dim_rev[1] = true;
- dim_rev[2] = false;
- dim_rev[3] = true;
-
- DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
- DataType* gpu_out_data_expected =static_cast<DataType*>(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType)));
- DataType* gpu_out_data_result =static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
-
- TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
- TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(gpu_out_data_expected, tensorRange);
- TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(gpu_out_data_result, tensorRange);
-
-
- sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
-
- if (LValue) {
- out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
- } else {
- out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
- }
- sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType));
-
-
- array<IndexType, 4> src_slice_dim;
- src_slice_dim[0] = 2;
- src_slice_dim[1] = 3;
- src_slice_dim[2] = 1;
- src_slice_dim[3] = 7;
- array<IndexType, 4> src_slice_start;
- src_slice_start[0] = 0;
- src_slice_start[1] = 0;
- src_slice_start[2] = 0;
- src_slice_start[3] = 0;
- array<IndexType, 4> dst_slice_dim = src_slice_dim;
- array<IndexType, 4> dst_slice_start = src_slice_start;
-
- for (IndexType i = 0; i < 5; ++i) {
- if (LValue) {
- out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
- in_gpu.slice(src_slice_start, src_slice_dim);
- } else {
- out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
- in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
- }
- src_slice_start[2] += 1;
- dst_slice_start[2] += 1;
- }
- sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
-
- for (IndexType i = 0; i < expected.dimension(0); ++i) {
- for (IndexType j = 0; j < expected.dimension(1); ++j) {
- for (IndexType k = 0; k < expected.dimension(2); ++k) {
- for (IndexType l = 0; l < expected.dimension(3); ++l) {
- VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
- }
- }
- }
- }
-
- dst_slice_start[2] = 0;
- result.setRandom();
- sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
- for (IndexType i = 0; i < 5; ++i) {
- if (LValue) {
- out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
- in_gpu.slice(dst_slice_start, dst_slice_dim);
- } else {
- out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
- in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
- }
- dst_slice_start[2] += 1;
- }
- sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
-
- for (IndexType i = 0; i < expected.dimension(0); ++i) {
- for (IndexType j = 0; j < expected.dimension(1); ++j) {
- for (IndexType k = 0; k < expected.dimension(2); ++k) {
- for (IndexType l = 0; l < expected.dimension(3); ++l) {
- VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
- }
- }
- }
- }
-}
-
-
-
-template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::device& d){
- std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
- QueueInterface queueInterface(d);
- auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device);
- test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device);
- test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false);
- test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false);
- test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true);
- test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
-}
-void test_cxx11_tensor_reverse_sycl() {
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(sycl_reverse_test_per_device<float>(device));
- }
-}
diff --git a/eigen/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
deleted file mode 100644
index c88db7c..0000000
--- a/eigen/unsupported/test/cxx11_tensor_shuffling_sycl.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-// Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_shuffling_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
-#define EIGEN_USE_SYCL
-
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::array;
-using Eigen::SyclDevice;
-using Eigen::Tensor;
-using Eigen::TensorMap;
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
-{
- IndexType sizeDim1 = 2;
- IndexType sizeDim2 = 3;
- IndexType sizeDim3 = 5;
- IndexType sizeDim4 = 7;
- array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
- Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
- Tensor<DataType, 4, DataLayout,IndexType> no_shuffle(tensorRange);
- tensor.setRandom();
-
- const size_t buffSize =tensor.size()*sizeof(DataType);
- array<IndexType, 4> shuffles;
- shuffles[0] = 0;
- shuffles[1] = 1;
- shuffles[2] = 2;
- shuffles[3] = 3;
- DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(buffSize));
- DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize));
-
-
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
- TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu2(gpu_data2, tensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize);
-
- gpu2.device(sycl_device)=gpu1.shuffle(shuffles);
- sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize);
- sycl_device.synchronize();
-
- VERIFY_IS_EQUAL(no_shuffle.dimension(0), sizeDim1);
- VERIFY_IS_EQUAL(no_shuffle.dimension(1), sizeDim2);
- VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3);
- VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4);
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
- for (IndexType l = 0; l < sizeDim4; ++l) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
- }
- }
- }
- }
-
- shuffles[0] = 2;
- shuffles[1] = 3;
- shuffles[2] = 1;
- shuffles[3] = 0;
- array<IndexType, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
- Tensor<DataType, 4, DataLayout,IndexType> shuffle(tensorrangeShuffle);
- DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize));
- TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu3(gpu_data3, tensorrangeShuffle);
-
- gpu3.device(sycl_device)=gpu1.shuffle(shuffles);
- sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize);
- sycl_device.synchronize();
-
- VERIFY_IS_EQUAL(shuffle.dimension(0), sizeDim3);
- VERIFY_IS_EQUAL(shuffle.dimension(1), sizeDim4);
- VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2);
- VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1);
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
- for (IndexType l = 0; l < sizeDim4; ++l) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
- }
- }
- }
- }
-}
-
-
-template<typename DataType, typename dev_Selector> void sycl_shuffling_test_per_device(dev_Selector s){
- QueueInterface queueInterface(s);
- auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device);
- test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device);
-
-}
-void test_cxx11_tensor_shuffling_sycl()
-{
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device));
- }
-}
diff --git a/eigen/unsupported/test/cxx11_tensor_striding_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_striding_sycl.cpp
deleted file mode 100644
index 603c374..0000000
--- a/eigen/unsupported/test/cxx11_tensor_striding_sycl.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
-#define EIGEN_USE_SYCL
-
-#include <iostream>
-#include <chrono>
-#include <ctime>
-
-#include "main.h"
-#include <unsupported/Eigen/CXX11/Tensor>
-
-using Eigen::array;
-using Eigen::SyclDevice;
-using Eigen::Tensor;
-using Eigen::TensorMap;
-
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_simple_striding(const Eigen::SyclDevice& sycl_device)
-{
-
- Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
- Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}};
-
-
- Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
- Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensor_dims);
- Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);
-
-
- std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
- std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
- std::size_t stride_bytes = stride.size() * sizeof(DataType);
- DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
- DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
- DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
-
-
- tensor.setRandom();
- array<IndexType, 4> strides;
- strides[0] = 1;
- strides[1] = 1;
- strides[2] = 1;
- strides[3] = 1;
- sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
- gpu_no_stride.device(sycl_device)=gpu_tensor.stride(strides);
- sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
-
- //no_stride = tensor.stride(strides);
-
- VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
- VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
- VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
- VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
-
- for (IndexType i = 0; i < 2; ++i) {
- for (IndexType j = 0; j < 3; ++j) {
- for (IndexType k = 0; k < 5; ++k) {
- for (IndexType l = 0; l < 7; ++l) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
- }
- }
- }
- }
-
- strides[0] = 2;
- strides[1] = 4;
- strides[2] = 2;
- strides[3] = 3;
-//Tensor<float, 4, DataLayout> stride;
-// stride = tensor.stride(strides);
-
- gpu_stride.device(sycl_device)=gpu_tensor.stride(strides);
- sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
-
- VERIFY_IS_EQUAL(stride.dimension(0), 1);
- VERIFY_IS_EQUAL(stride.dimension(1), 1);
- VERIFY_IS_EQUAL(stride.dimension(2), 3);
- VERIFY_IS_EQUAL(stride.dimension(3), 3);
-
- for (IndexType i = 0; i < 1; ++i) {
- for (IndexType j = 0; j < 1; ++j) {
- for (IndexType k = 0; k < 3; ++k) {
- for (IndexType l = 0; l < 3; ++l) {
- VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
- }
- }
- }
- }
-
- sycl_device.deallocate(d_tensor);
- sycl_device.deallocate(d_no_stride);
- sycl_device.deallocate(d_stride);
-}
-
-template <typename DataType, int DataLayout, typename IndexType>
-static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
-{
-
- Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
- Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}};
-
-
- Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
- Tensor<DataType, 4, DataLayout,IndexType> no_stride(stride_dims);
- Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims);
-
-
- std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
- std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
- std::size_t stride_bytes = stride.size() * sizeof(DataType);
-
- DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
- DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
- DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));
-
- Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims);
- Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);
-
- //Tensor<float, 4, DataLayout> tensor(2,3,5,7);
- tensor.setRandom();
- array<IndexType, 4> strides;
- strides[0] = 2;
- strides[1] = 4;
- strides[2] = 2;
- strides[3] = 3;
-
-// Tensor<float, 4, DataLayout> result(3, 12, 10, 21);
-// result.stride(strides) = tensor;
- sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
- gpu_stride.stride(strides).device(sycl_device)=gpu_tensor;
- sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
-
- for (IndexType i = 0; i < 2; ++i) {
- for (IndexType j = 0; j < 3; ++j) {
- for (IndexType k = 0; k < 5; ++k) {
- for (IndexType l = 0; l < 7; ++l) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l));
- }
- }
- }
- }
-
- array<IndexType, 4> no_strides;
- no_strides[0] = 1;
- no_strides[1] = 1;
- no_strides[2] = 1;
- no_strides[3] = 1;
-// Tensor<float, 4, DataLayout> result2(3, 12, 10, 21);
-// result2.stride(strides) = tensor.stride(no_strides);
-
- gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides);
- sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
-
- for (IndexType i = 0; i < 2; ++i) {
- for (IndexType j = 0; j < 3; ++j) {
- for (IndexType k = 0; k < 5; ++k) {
- for (IndexType l = 0; l < 7; ++l) {
- VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l));
- }
- }
- }
- }
- sycl_device.deallocate(d_tensor);
- sycl_device.deallocate(d_no_stride);
- sycl_device.deallocate(d_stride);
-}
-
-
-template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){
- QueueInterface queueInterface(s);
- auto sycl_device=Eigen::SyclDevice(&queueInterface);
- test_simple_striding<float, ColMajor, int64_t>(sycl_device);
- test_simple_striding<float, RowMajor, int64_t>(sycl_device);
- test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device);
- test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device);
-}
-
-void test_cxx11_tensor_striding_sycl() {
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(tensorStridingPerDevice(device));
- }
-}
diff --git a/eigen/unsupported/test/cxx11_tensor_sycl.cpp b/eigen/unsupported/test/cxx11_tensor_sycl.cpp
index 5cd0f4c..6a9c334 100644
--- a/eigen/unsupported/test/cxx11_tensor_sycl.cpp
+++ b/eigen/unsupported/test/cxx11_tensor_sycl.cpp
@@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_sycl
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_SYCL
#include "main.h"
@@ -27,105 +27,36 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
-template <typename DataType, int DataLayout, typename IndexType>
-void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
- IndexType sizeDim1 = 100;
- IndexType sizeDim2 = 10;
- IndexType sizeDim3 = 20;
- array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
- Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
- Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange);
- Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange);
+void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
- in1 = in1.random();
-
- DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
- DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType)));
-
- TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
- TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
-
- sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType));
- sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType));
- gpu1.device(sycl_device) = gpu1 * 3.14f;
- gpu2.device(sycl_device) = gpu2 * 2.7f;
- sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1,(out1.size())*sizeof(DataType));
- sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1,(out2.size())*sizeof(DataType));
- sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType));
- sycl_device.synchronize();
-
- for (IndexType i = 0; i < in1.size(); ++i) {
- VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
- VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
- VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
- }
-
- sycl_device.deallocate(gpu_data1);
- sycl_device.deallocate(gpu_data2);
-}
-
-template <typename DataType, int DataLayout, typename IndexType>
-void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
- IndexType size = 20;
- array<IndexType, 1> tensorRange = {{size}};
- Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
- Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange);
- Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange);
-
- in1 = in1.random();
- in2 = in1;
-
- DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
-
- TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange);
- sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType));
- sycl_device.synchronize();
- in1.setZero();
-
- sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType));
- sycl_device.synchronize();
-
- for (IndexType i = 0; i < in1.size(); ++i) {
- VERIFY_IS_APPROX(out(i), in2(i));
- }
-
- sycl_device.deallocate(gpu_data);
-}
-
-template <typename DataType, int DataLayout, typename IndexType>
-void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
-
- IndexType sizeDim1 = 100;
- IndexType sizeDim2 = 10;
- IndexType sizeDim3 = 20;
- array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
- Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange);
- Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange);
- Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange);
- Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange);
+ int sizeDim1 = 100;
+ int sizeDim2 = 100;
+ int sizeDim3 = 100;
+ array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+ Tensor<float, 3> in1(tensorRange);
+ Tensor<float, 3> in2(tensorRange);
+ Tensor<float, 3> in3(tensorRange);
+ Tensor<float, 3> out(tensorRange);
in2 = in2.random();
in3 = in3.random();
- DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
- DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType)));
- DataType * gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType)));
- DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
+ float * gpu_in1_data = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
+ float * gpu_in2_data = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
+ float * gpu_in3_data = static_cast<float*>(sycl_device.allocate(in3.dimensions().TotalSize()*sizeof(float)));
+ float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
- TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
- TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
- TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange);
- TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
+ TensorMap<Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
+ TensorMap<Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
+ TensorMap<Tensor<float, 3>> gpu_in3(gpu_in3_data, tensorRange);
+ TensorMap<Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
/// a=1.2f
gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
- sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType));
- sycl_device.synchronize();
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.dimensions().TotalSize())*sizeof(float));
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(in1(i,j,k), 1.2f);
}
}
@@ -134,12 +65,10 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
/// a=b*1.2f
gpu_out.device(sycl_device) = gpu_in1 * 1.2f;
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType));
- sycl_device.synchronize();
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.dimensions().TotalSize())*sizeof(float));
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) * 1.2f);
}
@@ -148,14 +77,12 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
printf("a=b*1.2f Test Passed\n");
/// c=a*b
- sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(float));
gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
- sycl_device.synchronize();
-
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) *
in2(i,j,k));
@@ -166,11 +93,10 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
/// c=a+b
gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
- sycl_device.synchronize();
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) +
in2(i,j,k));
@@ -181,11 +107,10 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
/// c=a*a
gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
- sycl_device.synchronize();
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) *
in1(i,j,k));
@@ -196,11 +121,10 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
//a*3.14f + b*2.7f
gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
- sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType));
- sycl_device.synchronize();
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) * 3.14f
+ in2(i,j,k) * 2.7f);
@@ -210,13 +134,12 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
printf("a*3.14f + b*2.7f Test Passed\n");
///d= (a>0.5? b:c)
- sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.size())*sizeof(DataType));
+ sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.dimensions().TotalSize())*sizeof(float));
gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
- sycl_device.synchronize();
- for (IndexType i = 0; i < sizeDim1; ++i) {
- for (IndexType j = 0; j < sizeDim2; ++j) {
- for (IndexType k = 0; k < sizeDim3; ++k) {
+ sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+ for (int i = 0; i < sizeDim1; ++i) {
+ for (int j = 0; j < sizeDim2; ++j) {
+ for (int k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f)
? in2(i, j, k)
: in3(i, j, k));
@@ -229,48 +152,8 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
sycl_device.deallocate(gpu_in3_data);
sycl_device.deallocate(gpu_out_data);
}
-template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType>
-static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
- IndexType size = 20;
- array<IndexType, 1> tensorRange = {{size}};
- Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange);
- Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange);
- Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange);
-
- in = in.random();
-
- Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
- Scalar2 * gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));
-
- TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange);
- TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
- sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1));
- gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>();
- sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2));
- out_host = in. template cast<Scalar2>();
- for(IndexType i=0; i< size; i++)
- {
- VERIFY_IS_APPROX(out(i), out_host(i));
- }
- printf("cast Test Passed\n");
- sycl_device.deallocate(gpu_in_data);
- sycl_device.deallocate(gpu_out_data);
-}
-template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
- QueueInterface queueInterface(s);
- auto sycl_device = Eigen::SyclDevice(&queueInterface);
- test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
- test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
- test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
- test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
- test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
- test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);
- test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device);
- test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device);
-}
-
void test_cxx11_tensor_sycl() {
- for (const auto& device :Eigen::get_sycl_supported_devices()) {
- CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
- }
+ cl::sycl::gpu_selector s;
+ Eigen::SyclDevice sycl_device(s);
+ CALL_SUBTEST(test_sycl_cpu(sycl_device));
}
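
All of the SYCL subtests deleted above share one structure: allocate a raw device buffer, wrap it in a TensorMap, copy host data in, evaluate a tensor expression through .device(sycl_device), copy the result back, synchronize, and verify on the host. The following is a minimal standalone sketch of that flow, modelled on the deleted sycl_computing_test_per_device helper; the gpu_selector choice and the EIGEN_USE_SYCL define are assumptions about the build, and the sketch is an illustration rather than a drop-in replacement for the test harness (VERIFY_IS_APPROX and CALL_SUBTEST are test-suite macros).

    #define EIGEN_USE_SYCL
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>
    #include <cmath>

    static void sycl_scale_sketch() {
      // Device setup as in the deleted helper: a QueueInterface wrapping a SYCL
      // selector/device, handed to Eigen::SyclDevice.
      cl::sycl::gpu_selector selector;                    // assumption: a GPU device is available
      Eigen::QueueInterface queueInterface(selector);
      Eigen::SyclDevice sycl_device(&queueInterface);

      Eigen::array<int64_t, 1> range = {{20}};
      Eigen::Tensor<float, 1, Eigen::RowMajor, int64_t> in(range), out(range);
      in.setRandom();

      // Raw device buffers wrapped in TensorMaps, mirroring the deleted tests.
      float* gpu_in  = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
      float* gpu_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));
      Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor, int64_t>> gin(gpu_in, range);
      Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor, int64_t>> gout(gpu_out, range);

      sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
      gout.device(sycl_device) = gin * 3.14f;             // expression evaluated on the device
      sycl_device.memcpyDeviceToHost(out.data(), gpu_out, out.size() * sizeof(float));
      sycl_device.synchronize();                          // wait for the copy-back to finish

      // Host-side check (the real tests use VERIFY_IS_APPROX instead of assert).
      for (int64_t i = 0; i < in.size(); ++i)
        assert(std::abs(out(i) - in(i) * 3.14f) < 1e-5f);

      sycl_device.deallocate(gpu_in);
      sycl_device.deallocate(gpu_out);
    }
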
diff --git a/eigen/unsupported/test/polynomialsolver.cpp b/eigen/unsupported/test/polynomialsolver.cpp
index 7ad4aa6..0c87478 100644
--- a/eigen/unsupported/test/polynomialsolver.cpp
+++ b/eigen/unsupported/test/polynomialsolver.cpp
@@ -32,10 +32,9 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )
{
typedef typename POLYNOMIAL::Index Index;
typedef typename POLYNOMIAL::Scalar Scalar;
- typedef typename POLYNOMIAL::RealScalar RealScalar;
typedef typename SOLVER::RootsType RootsType;
- typedef Matrix<RealScalar,Deg,1> EvalRootsType;
+ typedef Matrix<Scalar,Deg,1> EvalRootsType;
const Index deg = pols.size()-1;
@@ -58,7 +57,7 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )
cerr << endl;
}
- std::vector<RealScalar> rootModuli( roots.size() );
+ std::vector<Scalar> rootModuli( roots.size() );
Map< EvalRootsType > aux( &rootModuli[0], roots.size() );
aux = roots.array().abs();
std::sort( rootModuli.begin(), rootModuli.end() );
@@ -84,7 +83,7 @@ void evalSolver( const POLYNOMIAL& pols )
{
typedef typename POLYNOMIAL::Scalar Scalar;
- typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType;
+ typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType;
PolynomialSolverType psolve;
aux_evalSolver<Deg, POLYNOMIAL, PolynomialSolverType>( pols, psolve );
@@ -98,7 +97,6 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
{
using std::sqrt;
typedef typename POLYNOMIAL::Scalar Scalar;
- typedef typename POLYNOMIAL::RealScalar RealScalar;
typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType;
@@ -109,12 +107,15 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
// 1) the roots found are correct
// 2) the roots have distinct moduli
+ typedef typename POLYNOMIAL::Scalar Scalar;
+ typedef typename REAL_ROOTS::Scalar Real;
+
//Test realRoots
- std::vector< RealScalar > calc_realRoots;
- psolve.realRoots( calc_realRoots, test_precision<RealScalar>());
- VERIFY_IS_EQUAL( calc_realRoots.size() , (size_t)real_roots.size() );
+ std::vector< Real > calc_realRoots;
+ psolve.realRoots( calc_realRoots );
+ VERIFY( calc_realRoots.size() == (size_t)real_roots.size() );
- const RealScalar psPrec = sqrt( test_precision<RealScalar>() );
+ const Scalar psPrec = sqrt( test_precision<Scalar>() );
for( size_t i=0; i<calc_realRoots.size(); ++i )
{
@@ -137,7 +138,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
bool hasRealRoot;
//Test absGreatestRealRoot
- RealScalar r = psolve.absGreatestRealRoot( hasRealRoot );
+ Real r = psolve.absGreatestRealRoot( hasRealRoot );
VERIFY( hasRealRoot == (real_roots.size() > 0 ) );
if( hasRealRoot ){
VERIFY( internal::isApprox( real_roots.array().abs().maxCoeff(), abs(r), psPrec ) ); }
@@ -166,11 +167,9 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const
template<typename _Scalar, int _Deg>
void polynomialsolver(int deg)
{
- typedef typename NumTraits<_Scalar>::Real RealScalar;
- typedef internal::increment_if_fixed_size<_Deg> Dim;
+ typedef internal::increment_if_fixed_size<_Deg> Dim;
typedef Matrix<_Scalar,Dim::ret,1> PolynomialType;
typedef Matrix<_Scalar,_Deg,1> EvalRootsType;
- typedef Matrix<RealScalar,_Deg,1> RealRootsType;
cout << "Standard cases" << endl;
PolynomialType pols = PolynomialType::Random(deg+1);
@@ -183,11 +182,15 @@ void polynomialsolver(int deg)
evalSolver<_Deg,PolynomialType>( pols );
cout << "Test sugar" << endl;
- RealRootsType realRoots = RealRootsType::Random(deg);
+ EvalRootsType realRoots = EvalRootsType::Random(deg);
roots_to_monicPolynomial( realRoots, pols );
evalSolverSugarFunction<_Deg>(
pols,
- realRoots.template cast <std::complex<RealScalar> >().eval(),
+ realRoots.template cast <
+ std::complex<
+ typename NumTraits<_Scalar>::Real
+ >
+ >(),
realRoots );
}
@@ -211,6 +214,5 @@ void test_polynomialsolver()
internal::random<int>(9,13)
)) );
CALL_SUBTEST_11((polynomialsolver<float,Dynamic>(1)) );
- CALL_SUBTEST_12((polynomialsolver<std::complex<double>,Dynamic>(internal::random<int>(2,13))) );
}
}
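
The polynomialsolver changes above mostly concern which scalar type the real-root helpers use. For reference, the PolynomialSolver API they exercise takes coefficients in order of increasing degree and exposes roots(), realRoots() and absGreatestRealRoot(). A minimal usage sketch with illustrative coefficient values (not part of the test suite):

    #include <iostream>
    #include <vector>
    #include <unsupported/Eigen/Polynomials>

    int main() {
      // Coefficients of p(x) = 2 - 3x + x^2 = (x - 1)(x - 2), lowest degree first.
      Eigen::Vector3d coeffs(2.0, -3.0, 1.0);

      // Fixed degree 2 here; Eigen::Dynamic also works, as in the test above.
      Eigen::PolynomialSolver<double, 2> psolve(coeffs);

      // Complex roots, as consumed via psolve.roots() in aux_evalSolver.
      std::cout << "roots:\n" << psolve.roots() << "\n";

      // Real roots only, the sugar tested by evalSolverSugarFunction.
      std::vector<double> realRoots;
      psolve.realRoots(realRoots);
      for (double r : realRoots) std::cout << "real root: " << r << "\n";

      bool hasRealRoot = false;
      double greatest = psolve.absGreatestRealRoot(hasRealRoot);
      if (hasRealRoot) std::cout << "largest |real root|: " << greatest << "\n";
      return 0;
    }
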
diff --git a/eigen/unsupported/test/sparse_extra.cpp b/eigen/unsupported/test/sparse_extra.cpp
index 4f6723d..a010ceb 100644
--- a/eigen/unsupported/test/sparse_extra.cpp
+++ b/eigen/unsupported/test/sparse_extra.cpp
@@ -129,19 +129,6 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re
}
-template<typename SparseMatrixType>
-void check_marketio()
-{
- typedef Matrix<typename SparseMatrixType::Scalar, Dynamic, Dynamic> DenseMatrix;
- Index rows = internal::random<Index>(1,100);
- Index cols = internal::random<Index>(1,100);
- SparseMatrixType m1, m2;
- m1 = DenseMatrix::Random(rows, cols).sparseView();
- saveMarket(m1, "sparse_extra.mtx");
- loadMarket(m2, "sparse_extra.mtx");
- VERIFY_IS_EQUAL(DenseMatrix(m1),DenseMatrix(m2));
-}
-
void test_sparse_extra()
{
for(int i = 0; i < g_repeat; i++) {
@@ -156,15 +143,5 @@ void test_sparse_extra()
CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, ColMajor> >()) );
CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, RowMajor> >()) );
-
- CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,int> >()) );
- CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,int> >()) );
- CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,int> >()) );
- CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,int> >()) );
- CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,long int> >()) );
- CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,long int> >()) );
- CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,long int> >()) );
- CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,long int> >()) );
- TEST_SET_BUT_UNUSED_VARIABLE(s);
}
}
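
The deleted check_marketio subtest covered the Matrix Market round-trip helpers saveMarket/loadMarket from the SparseExtra module. A minimal standalone sketch of the same round-trip, assuming the current directory is writable; the file name is arbitrary:

    #include <iostream>
    #include <Eigen/Sparse>
    #include <unsupported/Eigen/SparseExtra>

    int main() {
      // Build a small random sparse matrix via a dense detour, as the deleted test did.
      Eigen::MatrixXd dense = Eigen::MatrixXd::Random(8, 5);
      Eigen::SparseMatrix<double> m1, m2;
      m1 = dense.sparseView();

      // Round-trip through a Matrix Market file.
      Eigen::saveMarket(m1, "sparse_extra_example.mtx");
      Eigen::loadMarket(m2, "sparse_extra_example.mtx");

      // The reloaded matrix should match the original entry-wise
      // (the deleted test compared dense copies with VERIFY_IS_EQUAL).
      std::cout << "round-trip matches: "
                << (Eigen::MatrixXd(m1).isApprox(Eigen::MatrixXd(m2)) ? "yes" : "no") << "\n";
      return 0;
    }
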