Diffstat (limited to 'eigen/unsupported/Eigen/CXX11')
-rw-r--r--  eigen/unsupported/Eigen/CXX11/Tensor | 9
-rw-r--r--  eigen/unsupported/Eigen/CXX11/ThreadPool | 13
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/README.md | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h | 30
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 287
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h | 134
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 101
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h | 76
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 400
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h | 208
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h | 3
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h | 476
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 7
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 4
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 413
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h | 13
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h | 12
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h | 5
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 11
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 4
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h | 41
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h | 14
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 4
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 12
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 5
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 53
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h | 7
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 30
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 1
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 179
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h | 5
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h | 10
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h | 16
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h | 12
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h | 54
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h | 188
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h | 265
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h | 322
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h | 245
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 138
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 57
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h | 77
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 1
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h | 142
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h | 7
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h | 8
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h | 23
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h | 2
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 6
-rw-r--r--  eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h | 13
59 files changed, 707 insertions, 3490 deletions
diff --git a/eigen/unsupported/Eigen/CXX11/Tensor b/eigen/unsupported/Eigen/CXX11/Tensor
index 3991609..7ecb4c7 100644
--- a/eigen/unsupported/Eigen/CXX11/Tensor
+++ b/eigen/unsupported/Eigen/CXX11/Tensor
@@ -13,14 +13,13 @@
#include "../../../Eigen/Core"
-#if defined(EIGEN_USE_SYCL)
+#ifdef EIGEN_USE_SYCL
#undef min
#undef max
#undef isnan
#undef isinf
#undef isfinite
#include <SYCL/sycl.hpp>
-#include <iostream>
#include <map>
#include <memory>
#include <utility>
@@ -53,10 +52,8 @@ typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
-#include <windows.h>
#else
#include <stdint.h>
-#include <unistd.h>
#endif
#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
@@ -71,10 +68,6 @@ typedef unsigned __int64 uint64_t;
#include <time.h>
#endif
-#if defined(EIGEN_USE_LIBXSMM)
-#include "libxsmm.h"
-#endif
-
#ifdef EIGEN_USE_THREADS
#include "ThreadPool"
#endif
diff --git a/eigen/unsupported/Eigen/CXX11/ThreadPool b/eigen/unsupported/Eigen/CXX11/ThreadPool
index c346141..09d637e 100644
--- a/eigen/unsupported/Eigen/CXX11/ThreadPool
+++ b/eigen/unsupported/Eigen/CXX11/ThreadPool
@@ -50,7 +50,6 @@
#include "src/ThreadPool/ThreadLocal.h"
#include "src/ThreadPool/ThreadYield.h"
-#include "src/ThreadPool/ThreadCancel.h"
#include "src/ThreadPool/EventCount.h"
#include "src/ThreadPool/RunQueue.h"
#include "src/ThreadPool/ThreadPoolInterface.h"
@@ -58,18 +57,6 @@
#include "src/ThreadPool/SimpleThreadPool.h"
#include "src/ThreadPool/NonBlockingThreadPool.h"
-
-// Use the more efficient NonBlockingThreadPool by default.
-namespace Eigen {
-#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
-template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
-typedef NonBlockingThreadPool ThreadPool;
-#else
-template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
-typedef SimpleThreadPool ThreadPool;
-#endif
-} // namespace Eigen
-
#endif
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
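
For reference, the block deleted above is what used to define the Eigen::ThreadPoolTempl alias and the Eigen::ThreadPool typedef in this header. A minimal sketch of what calling code might look like once the alias is gone, naming NonBlockingThreadPool directly; the include path and thread counts are illustrative assumptions, not part of this patch:

// Sketch only: with the ThreadPool typedef removed from this header,
// callers name the concrete pool type themselves.
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // NonBlockingThreadPool and SimpleThreadPool are both still included above.
  Eigen::NonBlockingThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
  a.setRandom();
  b.setRandom();
  c.device(device) = a + b;  // expression evaluated on the pool
  return 0;
}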
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md b/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
index 38cdb9c..98e8381 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1737,9 +1737,11 @@ TODO
## Representation of scalar values
-Scalar values are often represented by tensors of size 1 and rank 0.For example
-Tensor<T, N>::maximum() currently returns a Tensor<T, 0>. Similarly, the inner
-product of 2 1d tensors (through contractions) returns a 0d tensor.
+Scalar values are often represented by tensors of size 1 and rank 1. It would be
+more logical and user friendly to use tensors of rank 0 instead. For example
+Tensor<T, N>::maximum() currently returns a Tensor<T, 1>. Similarly, the inner
+product of 2 1d tensors (through contractions) returns a 1d tensor. In the
+future these operations might be updated to return 0d tensors instead.
## Limitations
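
As context for the wording change above, here is a small self-contained sketch of the rank-0 behaviour described in the removed ("-") lines. Whether a given checkout returns rank 0 or rank 1 for these results is exactly what this hunk is toggling, so treat the exact ranks as illustrative:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> t(3, 4);
  t.setRandom();

  // Reducing over all dimensions yields a rank-0 tensor holding one scalar.
  Eigen::Tensor<float, 0> m = t.maximum();
  std::cout << "max = " << m() << "\n";  // rank-0 tensors are read with ()

  // Inner product of two 1-d tensors via contraction: also a rank-0 result.
  Eigen::Tensor<float, 1> a(5), b(5);
  a.setRandom(); b.setRandom();
  Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(0, 0) };
  Eigen::Tensor<float, 0> dot = a.contract(b, dims);
  std::cout << "dot = " << dot() << "\n";
  return 0;
}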
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index fbe3408..7a45a5c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -186,12 +186,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
}
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived>
- expm1() const {
- return unaryExpr(internal::scalar_expm1_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
log() const {
return unaryExpr(internal::scalar_log_op<Scalar>());
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 23a7446..4cfe300 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -54,7 +54,7 @@ struct is_input_scalar<Sizes<> > {
static const bool value = true;
};
#ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::ptrdiff_t... Indices>
+template <typename std::size_t... Indices>
struct is_input_scalar<Sizes<Indices...> > {
static const bool value = (Sizes<Indices...>::total_size == 1);
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index c46a778..1ba7ef1 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -50,7 +50,6 @@ template <DenseIndex DimId>
struct DimensionId
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
- EIGEN_UNUSED_VARIABLE(dim);
eigen_assert(dim == DimId);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
@@ -151,7 +150,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device), m_offset(op.offset())
+ : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{
EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(NumInputDims > m_dim.actualDim());
@@ -207,7 +206,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
Index inputIndex = index * m_inputStride + m_inputOffset;
@@ -219,7 +218,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(m_stride > index);
return m_impl.template packet<LoadMode>(index + m_inputOffset);
@@ -275,29 +274,17 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
}
}
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex dimId() const {
- return m_dim.actualDim();
- }
-
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DenseIndex& offset() const {
- return m_offset;
- }
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex;
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
inputIndex = index * m_inputStride + m_inputOffset;
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(m_stride > index);
inputIndex = index + m_inputOffset;
@@ -317,9 +304,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
TensorEvaluator<ArgType, Device> m_impl;
const internal::DimensionId<DimId> m_dim;
const Device& m_device;
-// required by sycl
- const DenseIndex m_offset;
-
};
@@ -360,7 +344,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
- (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
+ (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(this->m_stride == 1);
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
@@ -371,7 +355,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
inputIndex += this->m_inputStride;
}
} else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) ||
- (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
+ (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
// m_stride is aways greater than index, so let's avoid the integer division.
eigen_assert(this->m_stride > index);
this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 2c7ba96..59bf90d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -276,12 +276,6 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
}
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
- /// required by sycl in order to extract the accessor
- const Axis& axis() const { return m_axis; }
protected:
Dimensions m_dimensions;
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index bf4a476..20b29e5 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -20,70 +20,6 @@ namespace Eigen {
*
*/
namespace internal {
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
-template<typename Scalar, typename Index>
-void pack_simple(Scalar * dst, const Scalar * src, Index cols, Index rows, Index lddst, Index ldsrc) {
- size_t psize = packet_traits<Scalar>::size; // Packet size
- typedef typename packet_traits<Scalar>::type Packet; // Packet type
- size_t alignment = psize*sizeof(Scalar); // Needed alignment
- if (rows % psize == 0 && (lddst*sizeof(Scalar)) % alignment == 0 &&
- (ldsrc*sizeof(Scalar)) % alignment == 0 &&
- reinterpret_cast<uintptr_t>(src) % alignment == 0 &&
- reinterpret_cast<uintptr_t>(dst) % alignment == 0) {
- // Optimized version using packets
- size_t num_packets = rows / psize;
- for (Index col = 0; col < cols; ++col) {
- EIGEN_ASM_COMMENT("begin pack_simple inner copy");
- // Unrolled manually 4 times.
- for (size_t i=0; i < num_packets/4; ++i) {
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- }
- for (size_t i=0; i < num_packets%4; ++i) {
- internal::pstore(dst, internal::pload<Packet>(src));
- dst += psize; src += psize;
- }
- dst += lddst - num_packets*psize;
- src += ldsrc - num_packets*psize;
- EIGEN_ASM_COMMENT("end pack_simple inner copy");
- }
- } else {
- // Naive memcpy calls
- for (Index col = 0; col < cols; ++col) {
- memcpy(dst + col*lddst, src + col*ldsrc, rows*sizeof(Scalar));
- }
- }
-}
-
-template<typename LhsScalar, typename RhsScalar, typename Scalar>
- struct libxsmm_wrapper {
- libxsmm_wrapper() {}
- libxsmm_wrapper(int, int, int, int, int, int, int, float, float, int) {}
- void operator()(const LhsScalar*, const RhsScalar*, Scalar*) {}
- void operator()(const LhsScalar*, const RhsScalar*, Scalar*, const LhsScalar*, const RhsScalar*, const Scalar*) {}
- };
-
- template<>
- struct libxsmm_wrapper<float, float, float>: public libxsmm_mmfunction<float> {
- libxsmm_wrapper(): libxsmm_mmfunction() {}
- libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
- libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
- };
-
- template<>
- struct libxsmm_wrapper<double, double, double>: public libxsmm_mmfunction<double> {
- libxsmm_wrapper(): libxsmm_mmfunction() {}
- libxsmm_wrapper(int flags, int m, int n, int k, int lda, int ldb, int ldc, float alpha, float beta, int prefetch) :
- libxsmm_mmfunction(flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch) {}
- };
-#endif
-
template<typename Dimensions, typename LhsXprType, typename RhsXprType>
struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
@@ -222,7 +158,7 @@ struct TensorContractionEvaluatorBase
m_device(device),
m_result(NULL) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
- static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+ static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -381,8 +317,6 @@ struct TensorContractionEvaluatorBase
}
}
- EnableXSMMIfPossible(eval_op_indices);
-
// If the layout is RowMajor, we need to reverse the m_dimensions
if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
@@ -393,7 +327,7 @@ struct TensorContractionEvaluatorBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar * data) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
if (data) {
@@ -488,13 +422,6 @@ struct TensorContractionEvaluatorBase
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
- #if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- if (m_can_use_xsmm) {
- evalGemmXSMM(buffer);
- return;
- }
- #endif
-
// columns in left side, rows in right side
const Index k = this->m_k_size;
@@ -611,214 +538,7 @@ struct TensorContractionEvaluatorBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
-protected:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void EnableXSMMIfPossible(const array<IndexPair<Index>, ContractDims>& eval_op_indices) {
- m_can_use_xsmm = false;
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
- if (!std::is_same<Scalar, LhsScalar>::value ||
- !std::is_same<Scalar, RhsScalar>::value ||
- !(std::is_same<Scalar, float>::value ||
- std::is_same<Scalar, double>::value) ||
- m_leftImpl.data() == NULL ||
- m_rightImpl.data() == NULL) {
- return;
- }
-
- // Check if we can use faster matmul algorithms. For contraction to be
- // equivalent to matmul, we need both lhs and rhs contracting dims sequences
- // to be either a prefix or suffix of all dims. Also, the order of both
- // must be the same, so we don't have to do reordering.
- // For example:
- // * OK: lhs 4D, rhs 4D, contraction: [(0, 2), (1, 3)]
- // * BAD: lhs 3D, rhs 3D, contraction: [(1,1)]
- // * BAD: lhs 3D, rhs 3D, contraction: [(0, 0), (2, 2)]
- // * BAD: lhs 3D, rhs 3D, contraction: [(0, 2), (1, 1)]
- // Depending if contraction dims are prefix or suffix of all dims we need to
- // pre-transpose matrices in matmul algorithm:
- // lhs: prefix -> transpose, suffix -> no transpose
- // rhs: prefix -> no transpose, suffix -> transpose
- // For example, for lhs 2D, rhs 2D, contraction [(1, 0)] is regular,
- // non-transposed matmul.
- if (ContractDims == 0) {
- // This case is totally uninteresting, filter it out to avoid problems
- // with iterations in further tests.
- return;
- }
-
- // Check if RHS dims list is increasing. LHS already is, so if not, the
- // order is different and we cannot do matmul.
- for (int i = 1; i < ContractDims; i++) {
- if (eval_op_indices[i].second < eval_op_indices[i-1].second) {
- return;
- }
- }
-
- // Check if no holes.
- int diff;
- for (int i = 1; i < ContractDims; i++) {
- // LHS contract dims are sorted to form an increasing seq.
- diff = eval_op_indices[i].first - eval_op_indices[i-1].first;
- if (diff != 1) {
- return;
- }
- // Now we may already assume RHS contract dims seq is increasing too.
- diff = eval_op_indices[i].second - eval_op_indices[i-1].second;
- if (diff != 1) {
- return;
- }
- }
-
- // Check if suffix or prefix.
- if (eval_op_indices[0].first != 0 &&
- eval_op_indices[ContractDims-1].first != LDims-1) {
- return;
- }
- if (eval_op_indices[0].second != 0 &&
- eval_op_indices[ContractDims-1].second != RDims-1) {
- return;
- }
-
- m_can_use_xsmm = true;
-#else
- EIGEN_UNUSED_VARIABLE(eval_op_indices);
-#endif
- }
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- EIGEN_DEVICE_FUNC void evalGemmXSMM(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
-
- // rows in left side
- const Index m = this->m_i_size;
-
- // columns in right side
- const Index n = this->m_j_size;
-
- const bool transposeA = !m_lhs_inner_dim_contiguous;
- const bool transposeB = !m_rhs_inner_dim_contiguous;
-
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
- internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index> blocking(
- k, m, n, 1, transposeA, transposeB);
-
- // Outer blocks sizes
- const Index mc_outer = blocking.outer_m();
- const Index nc_outer = blocking.outer_n();
- const Index kc_outer = blocking.outer_k();
- // Inner blocks sizes
- const Index mc = blocking.mc();
- const Index nc = blocking.nc();
- const Index kc = blocking.kc();
- // Decisions whether we should copy parts of matrices
- const bool copyA = blocking.copyA();
- const bool copyB = blocking.copyB();
-
- const LhsScalar* leftData = m_leftImpl.data();
- const RhsScalar* rightData = m_rightImpl.data();
-
- const libxsmm_blasint stride_A = static_cast<libxsmm_blasint>(transposeA ? k : m);
- const libxsmm_blasint stride_B = static_cast<libxsmm_blasint>(transposeB ? n : k);
- const libxsmm_blasint stride_C = static_cast<libxsmm_blasint>(m);
-
- const libxsmm_blasint stride_blockA = static_cast<libxsmm_blasint>(mc);
- // Use bigger stride to avoid hitting same cache line too often.
- // This consistently gives +~0.5 Gflops.
- const libxsmm_blasint stride_panelB = static_cast<libxsmm_blasint>(
- kc % 32 == 0 ? kc + 16 : kc
- );
-
- // Kernel for the general case (not edges)
- internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar> kernel;
-
- LhsScalar* blockA = NULL;
- RhsScalar* panelB = NULL;
-
- if (copyA) {
- blockA = static_cast<LhsScalar*>(this->m_device.allocate(mc * kc * sizeof(LhsScalar)));
- }
- if (copyB) {
- panelB = static_cast<RhsScalar*>(this->m_device.allocate(nc_outer * stride_panelB * sizeof(RhsScalar)));
- }
-
- const Index kernel_stride_A = copyA ? stride_blockA : stride_A;
- const Index kernel_stride_B = copyB ? stride_panelB : stride_B;
- kernel = internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, mc, nc, kc, kernel_stride_A, kernel_stride_B, stride_C, 1, 1, blocking.prefetch());
-
- // Outer blocking
- for (Index ki_outer = 0; ki_outer < k; ki_outer += kc_outer) {
- for (Index mi_outer = 0; mi_outer < m; mi_outer += mc_outer) {
- for (Index ni_outer = 0; ni_outer < n; ni_outer += nc_outer) {
- using numext::mini;
-
- Index actual_nc_outer = mini(ni_outer+nc_outer, n) - ni_outer;
-
- // Inner blocking
- for (Index ki = ki_outer; ki < mini(ki_outer+kc_outer, k); ki += kc) {
- const Index actual_kc = mini(ki_outer+kc_outer, mini(ki+kc, k)) - ki;
- const float beta = ki == 0 ? 0 : 1;
-
- if (copyB) {
- if (transposeB) {
- libxsmm_otrans(panelB, rightData + ki*stride_B + ni_outer, sizeof(RhsScalar), actual_nc_outer, actual_kc, stride_B, stride_panelB);
- } else {
- internal::pack_simple<RhsScalar, Index>(panelB, rightData + ni_outer*stride_B + ki, actual_nc_outer, actual_kc, stride_panelB, stride_B);
- }
- }
-
- for (Index mi = mi_outer; mi < mini(mi_outer+mc_outer, m); mi += mc) {
- const Index actual_mc = mini(mi_outer+mc_outer, mini(mi+mc, m)) - mi;
-
- const LhsScalar* a = transposeA ? leftData + mi*stride_A + ki :
- leftData + ki*stride_A + mi;
-
- if (copyA) {
- if (transposeA) {
- libxsmm_otrans(blockA, a, sizeof(LhsScalar), actual_kc, actual_mc, stride_A, stride_blockA);
- } else {
- internal::pack_simple<LhsScalar, Index>(blockA, a, actual_kc, actual_mc, stride_blockA, stride_A);
- }
- }
- const LhsScalar* actual_a = copyA ? blockA : a;
-
- for (Index ni = ni_outer; ni < mini(ni_outer+nc_outer, n); ni += nc) {
- const Index actual_nc = mini(ni_outer+nc_outer, mini(ni+nc, n)) - ni;
-
- const RhsScalar* b = rightData + ni*stride_B + ki;
- Scalar* c = buffer + ni*stride_C + mi;
- const Scalar* cp = c + nc*stride_C;
-
- const RhsScalar* actual_b = copyB ? panelB + (ni-ni_outer)*stride_panelB : b;
- const RhsScalar* bp = copyB ? panelB + nc*stride_panelB : b + nc*stride_B;
-
- if (actual_mc == mc && actual_kc == kc && actual_nc == nc && beta == 1) {
- // Most used, cached kernel.
- kernel(actual_a, actual_b, c, actual_a, bp, cp);
- } else {
- // Edges - use libxsmm kernel cache.
- internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(0, actual_mc, actual_nc, actual_kc, kernel_stride_A, kernel_stride_B, stride_C, 1, beta, blocking.prefetch())(actual_a, actual_b, c, actual_a, bp, cp);
- }
- }
- }
- }
- }
- }
- }
-
- if (copyA) {
- this->m_device.deallocate(blockA);
- }
- if (copyB) {
- this->m_device.deallocate(panelB);
- }
- }
-#endif
-
+ protected:
// Prevent assignment
TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
Dimensions m_dimensions;
@@ -844,7 +564,6 @@ protected:
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
const Device& m_device;
Scalar* m_result;
- bool m_can_use_xsmm;
};
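
The deleted EnableXSMMIfPossible comments above spell out when a tensor contraction degenerates to a plain matmul: the contracting dimensions must form a same-ordered, hole-free prefix or suffix on both sides. A brief sketch of the two 2-D cases those comments mention, using the public contract() API; shapes and variable names are illustrative:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> A(64, 32);    // 64 x 32
  Eigen::Tensor<float, 2> B(32, 48);    // 32 x 48
  Eigen::Tensor<float, 2> B2(64, 48);   // 64 x 48
  A.setRandom(); B.setRandom(); B2.setRandom();

  // lhs 2-D, rhs 2-D, contraction [(1, 0)]: contract A's columns with B's rows,
  // i.e. a regular, non-transposed matrix product C = A * B (64 x 48).
  Eigen::array<Eigen::IndexPair<int>, 1> matmul = { Eigen::IndexPair<int>(1, 0) };
  Eigen::Tensor<float, 2> C = A.contract(B, matmul);

  // Contracting A's dim 0 instead, [(0, 0)], corresponds to A^T * B2 (32 x 48):
  // still a matmul, but the lhs is logically transposed first, which is the
  // prefix/suffix distinction the removed heuristic was checking.
  Eigen::array<Eigen::IndexPair<int>, 1> t_matmul = { Eigen::IndexPair<int>(0, 0) };
  Eigen::Tensor<float, 2> Ct = A.contract(B2, t_matmul);

  return (C.dimension(0) == 64 && Ct.dimension(0) == 32) ? 0 : 1;
}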
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index d34f9ca..5cf7b4f 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -50,140 +50,6 @@ class TensorContractionBlocking {
};
-
-#if defined(EIGEN_USE_LIBXSMM)
-template <typename LhsScalar, typename RhsScalar, typename Index>
-class TensorXsmmContractionBlocking {
- public:
- TensorXsmmContractionBlocking(Index k, Index m, Index n,
- size_t max_num_threads = 1, bool transposeA = false,
- bool transposeB = false):
- k_(k), m_(m), n_(n), transposeA_(transposeA),
- transposeB_(transposeB), num_threads_(max_num_threads) {
-#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
- if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
- mc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M;
- kc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K;
- nc_ = EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N;
- outer_m_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_M;
- outer_k_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_K;
- outer_n_ = EIGEN_TEST_SPECIFIC_OUTER_BLOCKING_SIZE_N;
- copyA_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_A;
- copyB_ = EIGEN_TEST_SPECIFIC_BLOCKING_COPY_B;
- outer_m_ = outer_m_ != 0 ? outer_m_ : m;
- outer_k_ = outer_k_ != 0 ? outer_k_ : k;
- outer_n_ = outer_n_ != 0 ? outer_n_ : n;
- }
-#else
- // Defaults, possibly overriden per-platform.
- copyA_ = true;
- copyB_ = false;
-
- // If the matrix is small enough, don't do blocking, just call single xsmm
- // kernel.
- if (static_cast<double>(m)*k*n <= LIBXSMM_THRESHOLD) {
- mc_ = m; kc_ = k; nc_ = n;
- outer_m_ = m; outer_k_ = k; outer_n_ = n;
- copyA_ = false; copyB_ = false;
- } else {
- int arch = libxsmm_cpuid_x86();
-
- if (arch == LIBXSMM_X86_AVX512_CORE) {
- // skylake
- mc_ = 64; kc_ = 64; nc_ = 24;
- outer_m_ = 512; outer_k_ = 512; outer_n_ = 24*22;
- // Hack to use this kernel architecture as the other one has performance
- // issues (no hardware prefetching).
- // TODO(nishantpatil): This should be removed if the issues are fixed,
- // or this one becomes the default.
- setenv("LIBXSMM_AVX512_CLASSIC_GEMM", "1", 1);
- } else if (arch == LIBXSMM_X86_AVX2) {
- // haswell
- mc_ = 32; kc_ = 192; nc_ = 33;
- outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 33*16;
- } else if (arch == LIBXSMM_X86_AVX) {
- // ivybridge
- mc_ = 32; kc_ = 192; nc_ = 48;
- outer_m_ = 512; outer_k_ = 3*192; outer_n_ = 48*11;
- } else {
- // generic kernel size, usually performing well
- mc_ = 32; kc_ = 128; nc_ = 32;
- outer_m_ = 512; outer_k_ = 512; outer_n_ = 512;
- }
-
- // Only copy if it makes the stride smaller.
- copyA_ = copyA_ && (m > mc_);
- copyB_ = copyB_ && (k > kc_);
- }
-
- // We need to copy anyway if transposing
- copyA_ = copyA_ || transposeA;
- copyB_ = copyB_ || transposeB;
-
- // See libxsmm_gemm_prefetch_type definition in libxsmm_typedefs.h
- prefetch_ = LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C;
-
-#endif
-
- mc_ = mc_ > m ? m : mc_;
- nc_ = nc_ > n ? n : nc_;
- kc_ = kc_ > k ? k : kc_;
-
- size_t compute_parallelism = (m / mc_) * (n / nc_);
- size_t pack_parallelism = 0;
- if (copyA_) {
- pack_parallelism += (m / mc_) * (k / kc_);
- }
- if (copyB_) {
- pack_parallelism += (n / nc_) * (k / kc_);
- }
- size_t parallelism = numext::maxi(compute_parallelism, pack_parallelism);
-
- num_threads_ = numext::mini<size_t>(num_threads_,
- parallelism / MIN_JOBS_PER_THREAD);
- num_threads_ = numext::maxi<size_t>(num_threads_, 1);
-
- // For optimal performance outer block sizes should be multiplies of kernel
- // sizes, or bigger than matrix size (=no outer blocking).
- eigen_assert(outer_m_ % mc_ == 0 || outer_m_ >= m);
- eigen_assert(outer_k_ % kc_ == 0 || outer_k_ >= k);
- eigen_assert(outer_n_ % nc_ == 0 || outer_n_ >= n);
- }
-
- EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
- EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
- EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
- EIGEN_ALWAYS_INLINE Index outer_k() const { return outer_k_; }
- EIGEN_ALWAYS_INLINE Index outer_m() const { return outer_m_; }
- EIGEN_ALWAYS_INLINE Index outer_n() const { return outer_n_; }
- EIGEN_ALWAYS_INLINE bool copyA() const { return copyA_; }
- EIGEN_ALWAYS_INLINE bool copyB() const { return copyB_; }
- EIGEN_ALWAYS_INLINE bool transposeA() const { return transposeA_; }
- EIGEN_ALWAYS_INLINE bool transposeB() const { return transposeB_; }
- EIGEN_ALWAYS_INLINE int num_threads() const { return num_threads_; }
- EIGEN_ALWAYS_INLINE Index blocks_m() const { return divup(m_, mc_); }
- EIGEN_ALWAYS_INLINE Index blocks_k() const { return divup(k_, kc_); }
- EIGEN_ALWAYS_INLINE Index blocks_n() const { return divup(n_, nc_); }
- EIGEN_ALWAYS_INLINE libxsmm_gemm_prefetch_type prefetch() const {
- return prefetch_;
- }
-
- private:
- Index k_, m_, n_;
- Index kc_, mc_, nc_;
- Index outer_k_, outer_m_, outer_n_;
- bool copyA_, copyB_, transposeA_, transposeB_;
- size_t num_threads_;
-
- // Threshold for m*k*n to skip blocking and just call libxsmm
- const double LIBXSMM_THRESHOLD = 80*80*80;
- // For computing optimal number of threads - so that each thread gets at least
- // that many jobs.
- const double MIN_JOBS_PER_THREAD = 3;
- libxsmm_gemm_prefetch_type prefetch_;
-};
-#endif // EIGEN_USE_LIBXSMM
-
} // end namespace internal
} // end namespace Eigen
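
For reference, the thread-count heuristic in the deleted TensorXsmmContractionBlocking can be summarised as below. This is a simplified restatement of the removed logic, not code that remains in the tree; the helper name is made up and MIN_JOBS_PER_THREAD = 3 as in the deleted class:

#include <algorithm>
#include <cstddef>

// Each thread should get at least min_jobs_per_thread blocks of work,
// where the work is either compute blocks or packing blocks, whichever
// offers more parallelism.
std::size_t pick_num_threads(std::size_t max_threads,
                             std::size_t m, std::size_t n, std::size_t k,
                             std::size_t mc, std::size_t nc, std::size_t kc,
                             bool copyA, bool copyB) {
  const std::size_t compute_parallelism = (m / mc) * (n / nc);
  std::size_t pack_parallelism = 0;
  if (copyA) pack_parallelism += (m / mc) * (k / kc);
  if (copyB) pack_parallelism += (n / nc) * (k / kc);
  const std::size_t parallelism = std::max(compute_parallelism, pack_parallelism);
  const std::size_t min_jobs_per_thread = 3;
  return std::max<std::size_t>(1, std::min(max_threads, parallelism / min_jobs_per_thread));
}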
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index c04b784..d65dbb4 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -529,6 +529,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
float2 rhs_shmem2[][8], const Index m_size,
const Index n_size, const Index k_size,
const Index base_m, const Index base_n) {
+ typedef float Scalar;
// prefetch registers
float4 lhs_pf0, rhs_pf0;
@@ -539,27 +540,27 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
}
-#define prefetch_lhs(reg, row, col) \
- if (!CHECK_LHS_BOUNDARY) { \
- if (col < k_size) { \
- reg =lhs.template loadPacket<Unaligned>(row, col); \
- } \
- } else { \
- if (col < k_size) { \
- if (row + 3 < m_size) { \
- reg =lhs.template loadPacket<Unaligned>(row, col); \
- } else if (row + 2 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- reg.z =lhs(row + 2, col); \
- } else if (row + 1 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- } else if (row < m_size) { \
- reg.x =lhs(row + 0, col); \
- } \
- } \
- } \
+#define prefetch_lhs(reg, row, col) \
+ if (!CHECK_LHS_BOUNDARY) { \
+ if (col < k_size) { \
+ reg =lhs.loadPacket<Unaligned>(row, col); \
+ } \
+ } else { \
+ if (col < k_size) { \
+ if (row + 3 < m_size) { \
+ reg =lhs.loadPacket<Unaligned>(row, col); \
+ } else if (row + 2 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ } else if (row + 1 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ } else if (row < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ } \
+ } \
+ } \
Index lhs_vert = base_m+threadIdx.x*4;
@@ -577,7 +578,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -592,7 +593,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
} else {
if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
@@ -765,6 +766,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
float2 rhs_shmem2[][8], const Index m_size,
const Index n_size, const Index k_size,
const Index base_m, const Index base_n) {
+ typedef float Scalar;
// prefetch registers
float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
@@ -788,37 +790,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_LHS_BOUNDARY) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else {
// just CHECK_LHS_BOUNDARY
if (lhs_vert + 3 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else if (lhs_vert + 2 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
@@ -907,8 +909,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -930,8 +932,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (rhs_horiz1 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -952,7 +954,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
} else if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -1135,6 +1137,9 @@ EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
typedef float2 LHS_MEM[64][32];
typedef float2 RHS_MEM[128][8];
+ typedef float2 LHS_MEM16x16[32][16];
+ typedef float2 RHS_MEM16x16[64][8];
+
const Index m_block_idx = blockIdx.x;
const Index n_block_idx = blockIdx.y;
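
The hunks above drop the ".template" disambiguator from the mapper loadPacket calls. Strictly, that keyword is required when calling a member template with explicit template arguments on an object whose type depends on a template parameter, though some compilers accept the call without it. A generic, non-Eigen illustration (Mapper and prefetch are hypothetical names):

// Minimal illustration of the ".template" disambiguator in a dependent context.
struct Mapper {
  template <int Alignment>
  float loadPacket(int i, int j) const { return float(i + j + Alignment); }
};

template <typename LhsMapper>
float prefetch(const LhsMapper& lhs, int row, int col) {
  // Without ".template", the "<" would parse as a less-than comparison.
  return lhs.template loadPacket<0>(row, col);
}

int main() { return prefetch(Mapper(), 1, 2) > 0.f ? 0 : 1; }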
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index ab320a5..9b2cb3f 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -22,14 +22,8 @@ enum {
/*
* Implementation of the Eigen blas_data_mapper class for tensors.
*/
-/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which
-/// is scalar * for CoeffLoader.
-template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer> struct CoeffLoader;
-template<typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t,
- int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
- template <class> class MakePointer_ = MakePointer> class BaseTensorContractionMapper;
-
-template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_> struct CoeffLoader {
+
+template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
enum {
DirectOffsets = false
};
@@ -53,7 +47,7 @@ template <typename Tensor, bool HasRawAccess, template <class> class MakePointer
const Tensor m_tensor;
};
-template <typename Tensor, template <class> class MakePointer_> struct CoeffLoader<Tensor, true, MakePointer_> {
+template <typename Tensor> struct CoeffLoader<Tensor, true> {
enum {
DirectOffsets = true
};
@@ -73,14 +67,13 @@ template <typename Tensor, template <class> class MakePointer_> struct CoeffLoad
}
private:
typedef typename Tensor::Scalar Scalar;
-
- typename MakePointer_<const Scalar>::Type m_data;
+ const Scalar* m_data;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
- int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
+ int packet_size, bool inner_dim_contiguous, int Alignment>
class SimpleTensorContractionMapper {
public:
EIGEN_DEVICE_FUNC
@@ -96,7 +89,7 @@ class SimpleTensorContractionMapper {
m_k_strides(k_strides) { }
enum {
- DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
+ DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
};
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
@@ -213,22 +206,23 @@ class SimpleTensorContractionMapper {
}
protected:
- CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
+ CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
const nocontract_t m_nocontract_strides;
const nocontract_t m_ij_strides;
const contract_t m_contract_strides;
const contract_t m_k_strides;
};
+
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous,
- bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
-class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
+ bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
{
public:
- typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+ typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
@@ -241,9 +235,9 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
typedef typename Tensor::PacketReturnType Packet;
typedef typename unpacket_traits<Packet>::half HalfPacket;
- template <typename PacketT,int AlignmentType>
+ template <int AlignmentType>
EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
+ EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
@@ -281,13 +275,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
}
data[packet_size - 1] = this->m_tensor.coeff(last);
- return pload<PacketT>(data);
- }
-
- template <int AlignmentType>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
- return this->load<Packet,AlignmentType>(i,j);
+ return pload<Packet>(data);
}
template <int AlignmentType>
@@ -313,11 +301,11 @@ template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
bool inner_dim_contiguous,
- bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
-class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
+ bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
{
public:
- typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+ typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
@@ -334,12 +322,6 @@ class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, con
data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
return pload<typename Tensor::PacketReturnType>(data);
}
- template <typename PacketT,int> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
- EIGEN_ALIGN_MAX Scalar data[1];
- data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
- return pload<PacketT>(data);
- }
template <int> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
return loadPacket(i, j);
@@ -351,14 +333,14 @@ template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionSubMapper {
public:
typedef typename Tensor::PacketReturnType Packet;
typedef typename unpacket_traits<Packet>::half HalfPacket;
- typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
- typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
typedef Self LinearMapper;
enum {
@@ -403,14 +385,6 @@ class TensorContractionSubMapper {
return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
}
- template <typename PacketT, int AlignmentType>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
- if (UseDirectOffsets) {
- return m_base_mapper.template load<PacketT,AlignmentType>(i, j);
- }
- return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
- }
-
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
if (UseDirectOffsets) {
return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
@@ -418,7 +392,7 @@ class TensorContractionSubMapper {
return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
}
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet& p) const {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
if (UseDirectOffsets) {
m_base_mapper.storePacket(i, 0, p);
}
@@ -458,14 +432,14 @@ template<typename Scalar_, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper
- : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
+ : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
public:
typedef Scalar_ Scalar;
- typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
- typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
typedef SubMapper VectorMapper;
EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
deleted file mode 100644
index e87de0c..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ /dev/null
@@ -1,400 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorSyclConvertToDeviceExpression.h
- *
- * \brief:
- * TensorContractionsycl
- *
-*****************************************************************/
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
-namespace Eigen {
-
-template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
-template<typename Indices, typename LeftArgType, typename RightArgType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> :
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > {
-
- typedef const Eigen::SyclDevice Device;
-
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
- typedef TensorContractionEvaluatorBase<Self> Base;
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
- enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- };
-
- // Most of the code is assuming that both input tensors are ColMajor. If the
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
- // will pretend B is LHS and A is RHS.
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
- static const int LDims =
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
- static const int RDims =
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
- static const int ContractDims = internal::array_size<Indices>::value;
-
- typedef array<Index, LDims> left_dim_mapper_t;
- typedef array<Index, RDims> right_dim_mapper_t;
-
- typedef array<Index, ContractDims> contract_t;
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
- static const int NumDims = LDims + RDims - 2 * ContractDims;
-
- typedef DSizes<Index, NumDims> Dimensions;
-
- // typedefs needed in evalTo
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- typedef typename LeftEvaluator::Dimensions LeftDimensions;
- typedef typename RightEvaluator::Dimensions RightDimensions;
-
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
- Base(op, device) {}
-
- // We need to redefine this method to make nvcc happy
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
- this->m_leftImpl.evalSubExprsIfNeeded(NULL);
- this->m_rightImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- evalTo(data);
- return false;
- } else {
- this->m_result = static_cast<Scalar*>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
- evalTo(this->m_result);
- return true;
- }
- }
- const Eigen::SyclDevice& device() const {return this->m_device;}
- void evalTo(Scalar* buffer) const {
- // Here is the result
- if (this->m_lhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, false, false, Unaligned>(buffer);
- }
- }
- }
- else {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, false, false, Unaligned>(buffer);
- }
- }
- }
- }
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalTyped(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
- EIGEN_UNUSED_VARIABLE(k)
- // rows in left side
- const Index m = this->m_i_size;
- // columns in right side
- const Index n = this->m_j_size;
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
- LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
- this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
- this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
- }
- // required by sycl to construct the expr on the device. Returns original left_impl
- const TensorEvaluator<LeftArgType, Device>& left_impl() const {
- return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_leftImpl, this->m_rightImpl);
- }
- // required by sycl to construct the expr on the device. Returns original right_impl
- const TensorEvaluator<RightArgType, Device>& right_impl() const {
- return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl);
- }
-};
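The evaluator above swaps LHS and RHS when the operands are RowMajor so that a single column-major contraction path can be reused. A short worked identity (not part of the Eigen sources) spells out why the swap is valid:

    C = A * B                                   (requested product)
    memory of row-major(M)  ==  memory of column-major(M^T)
    (A * B)^T = B^T * A^T

So feeding the column-major kernel B as the "left" operand and A as the "right" operand produces C^T in column-major storage, which is bit-for-bit the row-major C the caller expects.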
-
-template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
-typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
-typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN,
-typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
- typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
- typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr;
- typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr;
- LHSFunctorExpr lhs_functors;
- RHSFunctorExpr rhs_functors;
- LhsLocalAcc localLhs;
- RhsLocalAcc localRhs;
- OutAccessor out_res;
- Index roundUpK, M, N, K;
- ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
- LeftNocontractT m_i_strides, m_left_nocontract_strides;
- RightNocontractT m_j_strides, m_right_nocontract_strides;
- LHSTupleType left_tuple_of_accessors;
- RHSTupleType right_tuple_of_accessors;
- Device dev;
-
-
- KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
- Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
- ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
- LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
- :lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
- m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
- m_right_contracting_strides(m_right_contracting_strides_),
- m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
- m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_),
- left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr;
- typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr;
- auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors);
- auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors);
- typedef decltype(lhs_dev_expr.expr) LeftArgType;
- typedef decltype(rhs_dev_expr.expr) RightArgType;
- typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, LeftNocontractT,
- ContractT, 1,
- lhs_inner_dim_contiguous,
- false, Unaligned, MakeGlobalPointer> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, RightNocontractT,
- ContractT, 1,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
- // initialize data mappers must happen inside the kernel for device eval
- LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
- lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
- RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
- rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
- auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
- // Matmul Kernel
- // Thread identifiers
- const Index mLocalThreadId = itemID.get_local(0); // Local ID row
- const Index nLocalThreadId = itemID.get_local(1); // Local ID col
- const Index mGroupId = itemID.get_group(0); // Work-group ID row
-    const Index nGroupId = itemID.get_group(1); // Work-group ID col
- const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
- // Allocate register space
- float privateLhs;
- float privateRhs[WorkLoadPerThreadN];
- float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
-    // Initialise the privateRes accumulation registers
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- privateRes[wLPTM][wLPTN] = 0.0f;
- }
- }
-
- // Tile Lhs
- for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
- Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localLhsRow = localLhsLinearId% TileSizeDimM;
- Index localLhsCol = localLhsLinearId/TileSizeDimM;
- // Load the value (wide vector load)
- Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
- localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0);
- }
- // Tile Rhs
- for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
- Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localRhsRow = localRhsLinearId% TileSizeDimN;
- Index localRhsCol = localRhsLinearId/TileSizeDimN;
- // Load the value (wide vector load)
- Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
- localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0);
-
- }
- // Loop over all tiles
- const Index numTiles = roundUpK/TileSizeDimK;
- Index firstHalf=0;
- do {
- // Synchronise
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- // Load the next tile of Lhs and Rhs into local memory
- Index nextHalf = firstHalf + 1;
- if (nextHalf < numTiles) {
- // Tile A
- for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
- Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localLhsRow = localLhsLinearId% TileSizeDimM;
- Index localLhsCol = localLhsLinearId/TileSizeDimM;
- // global K id
- Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
- // Store the loaded value into local memory
- localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0);
- }
- // Tile B
- for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
- Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
- Index localRhsRow = localRhsLinearId% TileSizeDimN;
- Index localRhsCol = localRhsLinearId/TileSizeDimN;
- // Load the value (wide vector load)
- Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
- // Store the loaded vector into local memory
- localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0);
- }
- }
- // Loop over the values of a single tile
- for (Index k=0; k<TileSizeDimK; k++) {
- // Cache the values of localRhs in registers
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
- privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
- }
- // Perform the computation
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
- privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
- }
- }
- }
- // Next tile
- firstHalf++;
- } while (firstHalf<numTiles);
-
- // Store the final results in C
- for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
- Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
- if (globalRow< M){
- for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
- Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
- if(globalCol<N)
- out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
- }
- }
- }
-
- }
-
-};
-template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
-
-static const Index TileSizeDimM = 32ul; // Tile size for dimension M
-static const Index TileSizeDimN = 32ul; // Tile size for dimension N
-static const Index TileSizeDimK = 16ul; // Tile size for dimension K
-static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M
-static const Index WorkLoadPerThreadN = 4ul; // Work load per thread in dimension N
-static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
-static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
-static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
-static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression
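With the tile sizes above, the derived launch constants work out as follows (plain arithmetic, shown only for reference):

    LocalThreadSizeM = 32 / 4 = 8
    LocalThreadSizeN = 32 / 4 = 8              // 8 x 8 = 64 work-items per work-group
    LoadPerThreadLhs = (16 * 4 * 4) / 32 = 8   // Lhs elements loaded per work-item per tile
    LoadPerThreadRhs = (16 * 4 * 4) / 32 = 8   // Rhs elements loaded per work-item per tile

Each work-group therefore covers a 32 x 32 output tile, with every work-item accumulating a 4 x 4 block of it.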
-
-// RoundUp function to make sure that the global work size is divisible by the local (tile) work size
-static Index RoundUp(Index x, Index y) {
- return ((((x) + (y) - 1) / (y))*(y));
-}
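RoundUp is ordinary round-up-to-a-multiple integer arithmetic; two quick examples:

    RoundUp(100, 16) = ((100 + 15) / 16) * 16 = 7 * 16 = 112
    RoundUp( 96, 16) = (( 96 + 15) / 16) * 16 = 6 * 16 =  96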
-
-template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
- static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
- ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
- LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
-
- typedef typename Self::XprType HostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
- typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
- typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr;
- typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr;
- typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr;
- typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> RHSFunctorExpr;
- // extract lhs functor list
- LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
- // extract rhs functor list
-  RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.right_impl());
-
- Index roundUpK = RoundUp(K, TileSizeDimK);
- Index roundUpM = RoundUp(M, TileSizeDimM);
- Index roundUpN = RoundUp(N, TileSizeDimN);
-
- self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
- /// work-around for gcc bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
- /// work-around for gcc bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType;
- // create lhs tuple of accessors
- LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl());
- // create rhs tuple of accessors
- RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl());
-
- // Local memory for elements of Lhs
- typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc;
- LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
- // Local memory for elements of Rhs
- typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
- RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);
-
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
-      // OutScalar memory
- OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
-
- // sycl parallel for
- cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
- cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
- KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
- RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
- WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
- localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
- m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
- });
- self.device().asynchronousExec();
- }
-};
-
-} // end namespace Eigen
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
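The contraction kernel removed above is a double-buffered, register-tiled GEMM: each work-group stages Lhs and Rhs tiles in local memory and each work-item accumulates a small register block of the output. A minimal single-threaded C++ sketch of the same tiling idea (hypothetical names, column-major storage, no SYCL and no double buffering) is:

    #include <algorithm>
    #include <vector>

    // Sketch of the tiling scheme: C (M x N) += A (M x K) * B (K x N),
    // all column-major, processed in TileM x TileK and TileK x TileN tiles,
    // accumulating into a small register block ("privateRes" in the kernel).
    // C is assumed zero-initialized, mirroring the device-side memset.
    template <int TileM, int TileN, int TileK>
    void tiled_gemm(const std::vector<float>& A, const std::vector<float>& B,
                    std::vector<float>& C, int M, int N, int K) {
      for (int m0 = 0; m0 < M; m0 += TileM) {
        for (int n0 = 0; n0 < N; n0 += TileN) {
          float acc[TileM][TileN] = {};             // register block
          for (int k0 = 0; k0 < K; k0 += TileK) {   // loop over K tiles
            const int kEnd = std::min(k0 + TileK, K);
            for (int k = k0; k < kEnd; ++k) {
              for (int m = m0; m < std::min(m0 + TileM, M); ++m) {
                const float a = A[k * M + m];       // A(m, k)
                for (int n = n0; n < std::min(n0 + TileN, N); ++n) {
                  acc[m - m0][n - n0] += a * B[n * K + k];  // B(k, n)
                }
              }
            }
          }
          for (int m = m0; m < std::min(m0 + TileM, M); ++m)
            for (int n = n0; n < std::min(n0 + TileN, N); ++n)
              C[n * M + m] += acc[m - m0][n - n0];  // C(m, n)
        }
      }
    }

The deleted kernel additionally splits the m/n loops across work-items and overlaps the load of tile k+1 with computation on tile k through the (firstHalf%2)/(nextHalf%2) halves of local memory.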
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index d30cc96..ee16cde 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -116,28 +116,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
- const Index m = this->m_i_size;
- const Index n = this->m_j_size;
- const Index k = this->m_k_size;
- if (m == 0 || n == 0 || k == 0) return;
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- if (this->m_can_use_xsmm) {
- bool transposeA = !this->m_lhs_inner_dim_contiguous;
- bool transposeB = !this->m_rhs_inner_dim_contiguous;
- internal::TensorXsmmContractionBlocking<LhsScalar, RhsScalar, Index>
- blocking(k, m, n, this->m_device.numThreads(), transposeA,
- transposeB);
-
- if (blocking.num_threads() == 1) {
- this->evalGemmXSMM(buffer);
- } else {
- ContextXsmm<Alignment>(this, buffer, m, n, k, blocking).run();
- }
- return;
- }
-#endif
-
typedef
typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
LhsScalar;
@@ -169,7 +147,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
Traits::mr, Traits::nr, false, false>
GebpKernel;
-
+ const Index m = this->m_i_size;
+ const Index n = this->m_j_size;
+ const Index k = this->m_k_size;
+ if (m == 0 || n == 0 || k == 0) return;
// Compute a set of algorithm parameters:
// - kernel block sizes (bm, bn, bk)
@@ -1063,187 +1044,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
rhsCost.dropMemoryCost();
return cost + lhsCost + rhsCost;
}
-
-#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
- template<int Alignment>
- class ContextXsmm {
- public:
- ContextXsmm(const Self* self, Scalar* buffer, Index m, Index n, Index k,
- const internal::TensorXsmmContractionBlocking<LhsScalar,
- RhsScalar, Index>& blocking):
- device(self->m_device),
- m(m), k(k), n(n),
- stride_a(blocking.transposeA() ? k : m),
- stride_b(blocking.transposeB() ? n : k),
- stride_c(m),
- bm(blocking.mc()), bk(blocking.kc()), bn(blocking.nc()),
- blocks_m(blocking.blocks_m()), blocks_k(blocking.blocks_k()),
- blocks_n(blocking.blocks_n()),
- copyA(blocking.copyA()), copyB(blocking.copyB()),
- transposeA(blocking.transposeA()), transposeB(blocking.transposeB()),
- num_threads(blocking.num_threads()),
- buffer(buffer),
- leftData(self->m_leftImpl.data()), rightData(self->m_rightImpl.data()),
- workers_done(blocking.num_threads()),
-
- packingA_jobs(0), packingB_jobs(0), compute_jobs(0),
- packingA_done(blocking.blocks_m()), packingB_done(blocking.blocks_n()) {}
-
- void worker() {
- // Pack
-
- if (copyA) {
- while (true) {
- uint32_t mk = packingA_jobs++;
- Index mi = mk / blocks_k;
- Index ki = mk % blocks_k;
- if (mi >= blocks_m) break;
-
- LhsScalar * blockA = blocksA + (bk*bm) * (mi*blocks_k+ki);
- if (transposeA) {
- const LhsScalar * current_a = leftData + (bm*mi)*stride_a + (bk*ki);
- libxsmm_otrans(blockA, current_a, sizeof(LhsScalar), actual_bk(ki),
- actual_bm(mi), stride_a, bm);
- } else {
- const LhsScalar * current_a = leftData + (bk*ki)*stride_a + (bm*mi);
- internal::pack_simple<LhsScalar, Index>(blockA, current_a,
- actual_bk(ki), actual_bm(mi), bm, stride_a);
- }
- packingA_done.at(mi)++;
- }
- }
-
- if (copyB) {
- while (true) {
- uint32_t nk = packingB_jobs++;
- Index ni = nk / blocks_k;
- Index ki = nk % blocks_k;
- if (ni >= blocks_n) break;
-
- RhsScalar * blockB = blocksB + (bk*bn) * (ni*blocks_k+ki);
- if (transposeB) {
- const RhsScalar * current_b = rightData + (ki*bk)*stride_b +
- (ni*bn);
- libxsmm_otrans(blockB, current_b, sizeof(RhsScalar), actual_bn(ni),
- actual_bk(ki), stride_b, bk);
- } else {
- const RhsScalar * current_b = rightData + (ni*bn)*stride_b +
- (ki*bk);
- internal::pack_simple<RhsScalar, Index>(blockB, current_b,
- actual_bn(ni), actual_bk(ki), bk, stride_b);
- }
- packingB_done.at(ni)++;
- }
- }
-
- // Compute
-
- while (true) {
- uint32_t mn = compute_jobs++;
- Index mi = mn / blocks_n;
- Index ni = mn % blocks_n;
- if (mi >= blocks_m) break;
-
- // Wait for mi, ni packings to be done. This is more fine-grained than
- // waiting for all workers to finish packing.
- while ((copyA && (packingA_done.at(mi) < blocks_k)) ||
- (copyB && (packingB_done.at(ni) < blocks_k)))
- {}
-
- for (Index ki=0; ki < blocks_k; ++ki) {
- const LhsScalar * current_a = copyA ?
- blocksA + (bk*bm) * (mi*blocks_k+ki) :
- leftData + (bk*ki)*stride_a + (bm*mi);
- const RhsScalar * current_b = copyB ?
- blocksB + (bk*bn) * (ni*blocks_k+ki) :
- rightData + (ni*bn)*stride_b + (bk*ki);
-
- Index current_stride_a = copyA ? bm : stride_a;
- Index current_stride_b = copyB ? bk : stride_b;
-
- // Memory may not be zeroed, overwrite instead of adding in first
- // iteration.
- float beta = ki == 0 ? 0 : 1;
-
- Scalar * current_c = buffer + (mi*bm) + (ni*bn)*stride_c;
- internal::libxsmm_wrapper<LhsScalar, RhsScalar, Scalar>(
- 0, actual_bm(mi), actual_bn(ni), actual_bk(ki),
- current_stride_a, current_stride_b, stride_c, 1, beta, 0)
- (current_a, current_b, current_c);
- }
- }
-
- workers_done.Notify();
- }
-
- void run() {
- // Parallelization strategy.
- //
- // First pack A into blocks (sharding by m, k) and B (sharding by n,k),
- // then shard by m, n.
- //
- // Do not use advanced ThreadPool queuing, just run a single long-standing
- // function in each thread.
- if (copyA) {
- blocksA = static_cast<LhsScalar*>(device.allocate(
- (blocks_m*bm)*(blocks_k*bk)*sizeof(LhsScalar)));
- }
- if (copyB) {
- blocksB = static_cast<RhsScalar*>(device.allocate(
- (blocks_n*bn)*(blocks_k*bk)*sizeof(RhsScalar)));
- }
-
- for (Index i = 0; i < num_threads; ++i) {
- device.enqueueNoNotification([=]() { worker(); });
- }
-
- workers_done.Wait();
-
- if (copyA) {
- device.deallocate(blocksA);
- }
- if (copyB) {
- device.deallocate(blocksB);
- }
- }
-
- private:
- // real block size for block index in [0, ..., blocks - 1].
- Index actual_bm(Index mi) const {
- return mi != blocks_m - 1 ? bm : m + bm - bm * blocks_m;
- }
- Index actual_bk(Index ki) const {
- return ki != blocks_k - 1 ? bk : k + bk - bk * blocks_k;
- }
- Index actual_bn(Index ni) const {
- return ni != blocks_n - 1 ? bn : n + bn - bn * blocks_n;
- }
-
- const Device& device;
- Index m, k, n;
- Index stride_a, stride_b, stride_c;
- Index bm, bk, bn; // Block sizes.
- Index blocks_m, blocks_k, blocks_n; // Number of blocks in each dimension.
- bool copyA, copyB, transposeA, transposeB;
- Index num_threads;
- Scalar *buffer;
- const LhsScalar *leftData;
- const RhsScalar *rightData;
-
- LhsScalar *blocksA;
- RhsScalar *blocksB;
- // barrier for joining all threads after all done.
- Barrier workers_done;
- // "queues" of (mi,ki), (ki,ni), (mi,ni) jobs packed [0,p)x[0,q) -> [0, p*q)
- std::atomic<uint32_t> packingA_jobs;
- std::atomic<uint32_t> packingB_jobs;
- std::atomic<uint32_t> compute_jobs;
- // already packed blocks for each mi-panel in A and ni-panel in B.
- std::vector<std::atomic<uint8_t>> packingA_done;
- std::vector<std::atomic<uint8_t>> packingB_done;
- };
-#endif
-
};
} // end namespace Eigen
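The removed ContextXsmm helper hands out packing and compute work by letting each worker repeatedly claim a flat job id from an atomic counter and decode it into a 2D block index (mi = id / blocks, ni = id % blocks). A small standalone C++ sketch of that distribution pattern (hypothetical names, no libxsmm or Eigen thread pool):

    #include <atomic>
    #include <cstdint>

    // Each worker claims the next flat job id and maps it to a
    // (block_row, block_col) pair; workers stop once the row index
    // runs past the grid. Mirrors the compute_jobs / packingA_jobs /
    // packingB_jobs counters in the deleted ContextXsmm class.
    struct FlatJobQueue {
      std::atomic<uint32_t> next{0};
      int rows, cols;                        // blocks_m x blocks_n grid

      template <typename Body>
      void worker(Body&& body) {
        while (true) {
          const uint32_t id = next++;        // atomic fetch-and-increment
          const int row = static_cast<int>(id) / cols;
          const int col = static_cast<int>(id) % cols;
          if (row >= rows) break;            // grid exhausted
          body(row, col);                    // e.g. run one GEMM block
        }
      }
    };

Every thread runs worker() with the same body; the fetch-and-increment guarantees each (row, col) pair is handed out exactly once, and threads simply exit once the counter runs past the grid.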
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index b29968b..860a694 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -246,9 +246,6 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the sycl accessor
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
protected:
template <int LoadMode, bool ActuallyVectorize>
struct PacketConv {
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 378f5cc..abdf742 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -100,7 +100,7 @@ class IndexMapper {
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
- if (static_cast<size_t>(i + 1) < offset) {
+ if (i + 1 < offset) {
m_cudaInputStrides[i] =
m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
m_cudaOutputStrides[i] =
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
deleted file mode 100644
index 4247c1c..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ /dev/null
@@ -1,476 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
-
-namespace Eigen {
-
-/** \class TensorConvolution
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor convolution class.
- *
- *
- */
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel1D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize, range_x, range_y;
-Buffer_accessor buffer_acc;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
-
- void operator()(cl::sycl::nd_item<2> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-
-  const size_t num_x_input = (itemID.get_local_range()[0] + kernelSize - 1); // number of input elements needed per plane in shared memory
- const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input;
- const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1));
- /// fill the shared memory
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + plane_kernel_offset ;
- const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start);
- if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- // calculate the convolution
- const size_t first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
- const size_t index = plane_kernel_offset+ itemID.get_local(0);
- for (size_t k = 0; k < kernelSize; ++k) {
- result += (local_acc[k + index] * kernel_ptr[k]);
- }
- const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
- +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
- buffer_ptr[tensor_index] = result;
- }
- }
-};
-
-
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel2D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
-Buffer_accessor buffer_acc;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
-
- void operator()(cl::sycl::nd_item<3> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-    const size_t num_x_input = (itemID.get_local_range()[0] + kernelSize_x - 1); // number of input elements needed along x for each plane in shared memory
-    const size_t num_y_input = (itemID.get_local_range()[1] + kernelSize_y - 1); // number of input elements needed along y for each plane in shared memory
- const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2));
- const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input;
-
- /// fill the shared memory
- const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
- for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
- const size_t local_input_offset = num_x_input * (j + plane_kernel_offset);
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + local_input_offset;
- const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start );
- if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
- }
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- // calculate the convolution
-    const size_t first_x_output_start = itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
-    const size_t first_y_output_start = itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
- for (size_t j = 0; j < kernelSize_y; j++) {
- size_t kernel_offset =kernelSize_x * j;
- const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0);
- for (size_t i = 0; i < kernelSize_x; i++) {
- result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]);
- }
- }
-      const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
-      +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_x_output_start, itemID.get_local(1) + first_y_output_start);
- buffer_ptr[tensor_index] = result;
- }
- }
-};
-
-
-
-template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
-typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
-struct EigenConvolutionKernel3D{
-typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
-internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
-Kernel_accessor kernel_filter;
-const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
-Buffer_accessor buffer_acc;
-Local_accessor local_acc;
-FunctorExpr functors;
-TupleType tuple_of_accessors;
-EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
- Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
- const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
- Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- :indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
- kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
- buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
-
- void operator()(cl::sycl::nd_item<3> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
- auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
-
- auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
- auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
-    const size_t num_x_input = (itemID.get_local_range()[0] + kernelSize_x - 1); // number of input elements needed along x in shared memory
-    const size_t num_y_input = (itemID.get_local_range()[1] + kernelSize_y - 1); // number of input elements needed along y in shared memory
-    const size_t num_z_input = (itemID.get_local_range()[2] + kernelSize_z - 1); // number of input elements needed along z in shared memory
- const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
- const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
- const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2];
- for(size_t p=0; p<numP; p++){
- /// fill the shared memory
- const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
- for (size_t k = itemID.get_local(2); k < num_z_input; k += itemID.get_local_range()[2]) {
- for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
- for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + (num_x_input * (j + (num_y_input * k)));
- const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start , k+ first_z_input_start );
- if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) && ((j + first_y_input_start) < (range_y +kernelSize_y-1)) && ((k + first_z_input_start) < (range_z +kernelSize_z-1)) ){
- local_acc[local_index] = device_evaluator.coeff(tensor_index);
- }
- else local_acc[local_index]=0.0f;
- }
- }
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- // calculate the convolution
-      const size_t first_x_output_start = itemID.get_group(0)*(itemID.get_local_range()[0]); // x
-      const size_t first_y_output_start = itemID.get_group(1)*(itemID.get_local_range()[1]); // y
-      const size_t first_z_output_start = itemID.get_group(2)*(itemID.get_local_range()[2]); // z
-
- if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
- for (size_t k = 0; k < kernelSize_z; k++) {
- for (size_t j = 0; j < kernelSize_y; j++) {
- for (size_t i = 0; i < kernelSize_x; i++) {
- const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k);
- const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2))));
- result += (local_acc[local_index] * kernel_ptr[kernel_index]);
- }
- }
- }
-        const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
-        +indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_x_output_start, itemID.get_local(1) + first_y_output_start, itemID.get_local(2) + first_z_output_start );
- buffer_ptr[tensor_index] = result;
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- }
- }
-};
-
-
-template<typename Indices, typename InputArgType, typename KernelArgType>
-struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice>
-{
- typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
-
- static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions>::value;
- static const int NumKernelDims = internal::array_size<Indices>::value;
- typedef typename XprType::Index Index;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions KernelDimensions;
- typedef const Eigen::SyclDevice Device;
-
- enum {
- IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
- PacketAccess = false,
- Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
- : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- const typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
- const typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
-
- m_dimensions = m_inputImpl.dimensions();
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = op.indices()[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- m_dimensions[index] = result_dim;
- }
- }
-
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, const Eigen::SyclDevice>::type PacketReturnType;
- typedef typename InputArgType::Scalar Scalar;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
- preloadKernel();
- m_inputImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- executeEval(data);
- return false;
- } else {
- m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
- executeEval(m_buf);
- return true;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
- m_inputImpl.cleanup();
- if (m_buf) {
- m_device.deallocate(m_buf);
- m_buf = NULL;
- }
- if (m_local_kernel) {
- m_device.deallocate((void*)m_kernel);
- m_local_kernel = false;
- }
- m_kernel = NULL;
- }
- /// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
- /// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
- // Don't make a local copy of the kernel unless we have to (i.e. it's an
- // expression that needs to be evaluated)
- const Scalar* in_place = m_kernelImpl.data();
- if (in_place) {
- m_kernel = in_place;
- m_local_kernel = false;
- } else {
- size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
- Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
- typedef TensorEvalToOp<const KernelArgType> EvalTo;
- EvalTo evalToTmp(local, m_kernelArg);
- const bool PacketAccess = internal::IsVectorizable<const Eigen::SyclDevice, KernelArgType>::value;
- internal::TensorExecutor<const EvalTo, const Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
- m_kernel = local;
- m_local_kernel = true;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const {
- typedef TensorEvaluator<InputArgType, const Eigen::SyclDevice> InputEvaluator;
- typedef typename InputEvaluator::Dimensions InputDims;
-
- typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
- // extract input functor list
- InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
-
-
- m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
-
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> InputLocalAcc;
- /// work-around for gcc 4.8 auto bug
- typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl)) InputTupleType;
- // create input tuple of accessors
- InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
-
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType;
- OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data);
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
- KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);
-
- switch (NumKernelDims) {
- case 1: {
- const size_t numX = dimensions()[m_indices[0]];
- const size_t numP = dimensions().TotalSize() / numX;
- const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y;
- m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y );
- const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y);
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range
- auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- const array<Index, 1> indices{{m_indices[0]}};
- const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
- internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
- EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors));
- break;
- }
-
- case 2: {
- const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
- const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
- const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
- const size_t numX = dimensions()[m_indices[idxX]];
- const size_t numY = dimensions()[m_indices[idxY]];
- const size_t numP = dimensions().TotalSize() / (numX*numY);
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
- m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
- const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z;
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
- auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- const array<Index, 2> indices {{m_indices[idxX], m_indices[idxY]}};
- const array<Index, 2> kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}};
- internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
- EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors));
- break;
- }
-
- case 3: {
- const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
- const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
- const size_t idxZ =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
- const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
- const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ];
- const size_t numX = dimensions()[m_indices[idxX]];
- const size_t numY = dimensions()[m_indices[idxY]];
- const size_t numZ = dimensions()[m_indices[idxZ]];
- const size_t numP = dimensions().TotalSize() / (numX*numY*numZ);
- const array<Index, 3> indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}};
- const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}};
- internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
- m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
-        const size_t shared_mem = (tileSize_x + kernel_size_x - 1)*(tileSize_y + kernel_size_y - 1) * (tileSize_z + kernel_size_z - 1);
- assert(static_cast<unsigned long>(shared_mem) <= m_device.sharedMemPerBlock());
- auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
- auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
- InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
- cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
- EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
- InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
- indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
- numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors));
- break;
- }
-
- default: {
- EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
- }
- }
- });
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- eigen_assert(m_buf);
- eigen_assert(index < m_dimensions.TotalSize());
- return m_buf[index];
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
- {
- eigen_assert(m_buf);
- eigen_assert(index < m_dimensions.TotalSize());
- return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
- // model.
- const double kernel_size = m_kernelImpl.dimensions().TotalSize();
- // We ignore the use of fused multiply-add.
- const double convolve_compute_cost =
- TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
- const double firstIndex_compute_cost =
- NumDims *
- (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>());
- return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
- kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
- m_kernelImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, convolve_compute_cost, vectorized,
- PacketSize));
- }
-
- private:
- // No assignment (copies are needed by the kernels)
- TensorEvaluator& operator = (const TensorEvaluator&);
- TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl;
- KernelArgType m_kernelArg;
- TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl;
- Indices m_indices;
- Dimensions m_dimensions;
- Scalar* m_buf;
- const Scalar* m_kernel;
- bool m_local_kernel;
- const Eigen::SyclDevice& m_device;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
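The 1D convolution kernel removed above stages a tile of the input plus a halo of kernelSize - 1 extra values in local memory, synchronises the work-group, and then computes one output per work-item from the cached window. A scalar C++ sketch of that staging scheme (hypothetical names, no SYCL):

    #include <vector>

    // Valid 1D convolution of `input` (length n) with `kernel` (length k),
    // processed tile by tile: each tile caches tileSize + k - 1 input values
    // (the tile plus its right halo) before computing tileSize outputs.
    std::vector<float> conv1d_tiled(const std::vector<float>& input,
                                    const std::vector<float>& kernel,
                                    int tileSize) {
      const int n = static_cast<int>(input.size());
      const int k = static_cast<int>(kernel.size());
      const int outSize = n - k + 1;
      std::vector<float> output(outSize, 0.0f);
      std::vector<float> local(tileSize + k - 1);   // stands in for local_acc

      for (int start = 0; start < outSize; start += tileSize) {
        // Stage the tile plus halo, padding with zeros past the end.
        for (int i = 0; i < tileSize + k - 1; ++i) {
          const int idx = start + i;
          local[i] = (idx < n) ? input[idx] : 0.0f;
        }
        // (a work-group barrier would go here on the device)
        for (int x = 0; x < tileSize && start + x < outSize; ++x) {
          float result = 0.0f;
          for (int j = 0; j < k; ++j) result += local[x + j] * kernel[j];
          output[start + x] = result;
        }
      }
      return output;
    }

The 2D and 3D kernels apply the same idea per dimension, which is where the (tileSize + kernel_size - 1) factors in the shared-memory size computations come from.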
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index be8d693..4f5767b 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -88,7 +88,7 @@ static void initializeDeviceProp() {
#if __cplusplus >= 201103L
std::atomic_thread_fence(std::memory_order_acquire);
#endif
- EIGEN_SLEEP(1000);
+ sleep(1);
}
}
}
@@ -217,10 +217,7 @@ struct GpuDevice {
EIGEN_UNUSED_VARIABLE(err)
assert(err == cudaSuccess);
#else
- EIGEN_UNUSED_VARIABLE(dst);
- EIGEN_UNUSED_VARIABLE(src);
- EIGEN_UNUSED_VARIABLE(n);
- eigen_assert(false && "The default device should be used instead to generate kernel code");
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index ccaaa6c..9d14139 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -45,7 +45,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#ifndef __CUDA_ARCH__
// Running on the host CPU
return l1CacheSize();
#else
@@ -55,7 +55,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#ifndef __CUDA_ARCH__
// Running single threaded on the host CPU
return l3CacheSize();
#else
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index e209799..7c03989 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -16,400 +16,107 @@
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
namespace Eigen {
-
- #define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
-
- template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
- public:
- MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset) : m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
- auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc);
- auto dst_ptr = ConvertToActualTypeSycl(Scalar, m_dst_acc);
- auto globalid = itemID.get_global_linear_id();
- if (globalid < m_rng) {
- dst_ptr[globalid + m_i] = src_ptr[globalid + m_offset];
- }
- }
-
- private:
- read_accessor m_src_acc;
- write_accessor m_dst_acc;
- size_t m_rng;
- size_t m_i;
- size_t m_offset;
- };
-
- struct memsetkernelFunctor{
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> AccType;
- AccType m_acc;
- const size_t m_rng, m_c;
- memsetkernelFunctor(AccType acc, const size_t rng, const size_t c):m_acc(acc), m_rng(rng), m_c(c){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- auto globalid=itemID.get_global_linear_id();
- if (globalid< m_rng) m_acc[globalid] = m_c;
- }
-
- };
-
-EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
- auto devices = cl::sycl::device::get_devices();
- std::vector<cl::sycl::device>::iterator it =devices.begin();
- while(it!=devices.end()) {
-    /// get_devices returns all the available OpenCL devices. Either use a device_selector or exclude devices that ComputeCpp does not support (AMD OpenCL for CPU).
- auto s= (*it).template get_info<cl::sycl::info::device::vendor>();
- std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if((*it).is_cpu() && s.find("amd")!=std::string::npos && s.find("apu") == std::string::npos){ // exclude AMD CPU devices, which ComputeCpp does not support, but allow APUs
- it=devices.erase(it);
- }
- else{
- ++it;
- }
- }
- return devices;
-}
-
-struct QueueInterface {
- /// class members:
- bool exception_caught_ = false;
-
- mutable std::mutex mutex_;
-
+struct SyclDevice {
+ /// class members
+ /// sycl queue
+ mutable cl::sycl::queue m_queue;
/// std::map is the container used to make sure that we create only one buffer
/// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
 /// If a non-read-only pointer needs to be accessed on the host, we should deallocate it manually.
- mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
- /// sycl queue
- mutable cl::sycl::queue m_queue;
- /// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename
- /// SyclStreamDevice is not owned. it is the caller's responsibility to destroy it.
- template<typename dev_Selector> explicit QueueInterface(const dev_Selector& s):
+ mutable std::map<const void *, std::shared_ptr<void>> buffer_map;
+ /// creating device by using selector
+ template<typename dev_Selector> SyclDevice(dev_Selector s)
+ :
#ifdef EIGEN_EXCEPTIONS
- m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
+ m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) {
for (const auto& e : l) {
try {
- if (e) {
- exception_caught_ = true;
- std::rethrow_exception(e);
- }
+ std::rethrow_exception(e);
} catch (cl::sycl::exception e) {
- std::cerr << e.what() << std::endl;
- }
+ std::cout << e.what() << std::endl;
+ }
}
}))
#else
-m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
- for (const auto& e : l) {
- if (e) {
- exception_caught_ = true;
- std::cerr << "Error detected Inside Sycl Device."<< std::endl;
-
- }
- }
-}))
+ m_queue(cl::sycl::queue(s))
#endif
{}
+ // destructor
+ ~SyclDevice() { deallocate_all(); }
- /// Allocating device pointer. This pointer is actually an 8 bytes host pointer used as key to access the sycl device buffer.
- /// The reason is that we cannot use device buffer as a pointer as a m_data in Eigen leafNode expressions. So we create a key
- /// pointer to be used in Eigen expression construction. When we convert the Eigen construction into the sycl construction we
- /// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer.
- /// The device pointer would be deleted by calling deallocate function.
- EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
- auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes));
- auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer();
- buf.set_final_data(nullptr);
- std::lock_guard<std::mutex> lock(mutex_);
- buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf));
- return static_cast<void*>(ptr);
- }
-
- /// This is used to deallocate the device pointer. p is used as a key inside
- /// the map to find the device buffer and delete it.
- EIGEN_STRONG_INLINE void deallocate(void *p) const {
- std::lock_guard<std::mutex> lock(mutex_);
- auto it = buffer_map.find(static_cast<const uint8_t*>(p));
+ template <typename T> void deallocate(T *p) const {
+ auto it = buffer_map.find(p);
if (it != buffer_map.end()) {
buffer_map.erase(it);
+ internal::aligned_free(p);
}
}
-
- EIGEN_STRONG_INLINE void deallocate_all() const {
- std::lock_guard<std::mutex> lock(mutex_);
- buffer_map.clear();
- }
-
- EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator find_buffer(const void* ptr) const {
- std::lock_guard<std::mutex> lock(mutex_);
- auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr));
- if (it1 != buffer_map.end()){
- return it1;
- }
- else{
- for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){
- auto size = it->second.get_size();
- if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it;
- }
- }
- std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling malloc-ed function."<< std::endl;
- abort();
- }
-
- // This function checks if the runtime recorded an error for the
- // underlying stream device.
- EIGEN_STRONG_INLINE bool ok() const {
- if (!exception_caught_) {
- m_queue.wait_and_throw();
+ void deallocate_all() const {
+ std::map<const void *, std::shared_ptr<void>>::iterator it=buffer_map.begin();
+ while (it!=buffer_map.end()) {
+ auto p=it->first;
+ buffer_map.erase(it);
+ internal::aligned_free(const_cast<void*>(p));
+ it=buffer_map.begin();
}
- return !exception_caught_;
+ buffer_map.clear();
}
- // destructor
- ~QueueInterface() { buffer_map.clear(); }
-};
-
-struct SyclDevice {
- // class member.
- QueueInterface* m_queue_stream;
- /// QueueInterface is not owned. it is the caller's responsibility to destroy it.
- explicit SyclDevice(QueueInterface* queue_stream) : m_queue_stream(queue_stream){}
-
- /// Creation of sycl accessor for a buffer. This function first tries to find
+ /// creation of sycl accessor for a buffer. This function first tries to find
/// the buffer in the buffer_map. If found it gets the accessor from it, if not,
- /// the function then adds an entry by creating a sycl buffer for that particular pointer.
- template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer>
- get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const {
- return (get_sycl_buffer(ptr).template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
+ /// the function then adds an entry by creating a sycl buffer for that particular pointer.
+ template <cl::sycl::access::mode AcMd, typename T> inline cl::sycl::accessor<T, 1, AcMd, cl::sycl::access::target::global_buffer>
+ get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const {
+ return (get_sycl_buffer<T>(num_bytes, ptr)->template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
}
- /// Accessing the created sycl device buffer for the device pointer
- EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const {
- return m_queue_stream->find_buffer(ptr)->second;
+ template<typename T> inline std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const {
+ using Type = cl::sycl::buffer<T, 1>;
+ std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> ret = buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(ptr, std::shared_ptr<void>(new Type(cl::sycl::range<1>(num_bytes)),
+ [](void *dataMem) { delete static_cast<Type*>(dataMem); })));
+ (static_cast<Type*>(buffer_map.at(ptr).get()))->set_final_data(nullptr);
+ return ret;
}
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
- tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
- auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
- std::transform(s.begin(), s.end(), s.begin(), ::tolower);
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
- }
- rng = n;
- if (rng==0) rng=static_cast<Index>(1);
- GRange=rng;
- if (tileSize>GRange) tileSize=GRange;
- else if(GRange>tileSize){
- Index xMode = static_cast<Index>(GRange % tileSize);
- if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
- }
- }
-
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
- Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
- }
- Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
- rng1=dim1;
- if (rng1==0 ) rng1=static_cast<Index>(1);
- GRange1=rng1;
- if (tileSize1>GRange1) tileSize1=GRange1;
- else if(GRange1>tileSize1){
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
- }
- tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
- rng0 = dim0;
- if (rng0==0 ) rng0=static_cast<Index>(1);
- GRange0=rng0;
- if (tileSize0>GRange0) tileSize0=GRange0;
- else if(GRange0>tileSize0){
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
- }
+ template <typename T> inline cl::sycl::buffer<T, 1>* get_sycl_buffer(size_t num_bytes,const T * ptr) const {
+ return static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(ptr, num_bytes).first->second.get());
}
-
-
- /// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
- template<typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
- Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
- if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
- max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
- }
- Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
- rng2=dim2;
- if (rng2==0 ) rng1=static_cast<Index>(1);
- GRange2=rng2;
- if (tileSize2>GRange2) tileSize2=GRange2;
- else if(GRange2>tileSize2){
- Index xMode = static_cast<Index>(GRange2 % tileSize2);
- if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
- }
- pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
- tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
- rng1=dim1;
- if (rng1==0 ) rng1=static_cast<Index>(1);
- GRange1=rng1;
- if (tileSize1>GRange1) tileSize1=GRange1;
- else if(GRange1>tileSize1){
- Index xMode = static_cast<Index>(GRange1 % tileSize1);
- if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
- }
- tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
- rng0 = dim0;
- if (rng0==0 ) rng0=static_cast<Index>(1);
- GRange0=rng0;
- if (tileSize0>GRange0) tileSize0=GRange0;
- else if(GRange0>tileSize0){
- Index xMode = static_cast<Index>(GRange0 % tileSize0);
- if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
- }
- }
- /// allocate device memory
- EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
- return m_queue_stream->allocate(num_bytes);
+ /// allocating memory on the cpu
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const {
+ return internal::aligned_malloc(8);
}
- /// deallocate device memory
- EIGEN_STRONG_INLINE void deallocate(void *p) const {
- m_queue_stream->deallocate(p);
- }
// some runtime conditions that can be applied here
- EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+ bool isDeviceSuitable() const { return true; }
- /// the memcpy function
- template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
- auto it1 = m_queue_stream->find_buffer(static_cast<const void*>(src));
- auto it2 = m_queue_stream->find_buffer(dst);
- auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
- auto i= (static_cast<const uint8_t*>(dst)) - it2->first;
- offset/=sizeof(Index);
- i/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- sycl_queue().submit([&](cl::sycl::handler &cgh) {
- auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset));
- });
- synchronize();
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+ ::memcpy(dst, src, n);
}
- /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
- /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode
- /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
- /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
- /// this buffer is accessed, the data will be copied to the device.
- template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
- auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
- ::memcpy(host_acc.get_pointer(), src, n);
+ template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const {
+ auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
+ memcpy(host_acc.get_pointer(), src, n);
}
- /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl
- /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
- /// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination
- /// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data
- /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
- /// to the cpu only once per function call.
- template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
- auto it = m_queue_stream->find_buffer(src);
- auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
- offset/=sizeof(Index);
- size_t rng, GRange, tileSize;
- parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
- // Assuming that the dst is the start of the destination pointer
- auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
- sycl_queue().submit([&](cl::sycl::handler &cgh) {
- auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
- auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(src_acc) read_accessor;
- typedef decltype(dst_acc) write_accessor;
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
- });
- synchronize();
- }
- /// returning the sycl queue
- EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
- /// Here is the implementation of memset function on sycl.
- EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
- size_t rng, GRange, tileSize;
- parallel_for_setup(n, tileSize, rng, GRange);
- sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
- synchronize();
- }
-
- struct memsetCghFunctor{
- cl::sycl::buffer<uint8_t, 1>& m_buf;
- const size_t& rng , GRange, tileSize;
- const int &c;
- memsetCghFunctor(cl::sycl::buffer<uint8_t, 1>& buff, const size_t& rng_, const size_t& GRange_, const size_t& tileSize_, const int& c_)
- :m_buf(buff), rng(rng_), GRange(GRange_), tileSize(tileSize_), c(c_){}
-
- void operator()(cl::sycl::handler &cgh) const {
- auto buf_acc = m_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), memsetkernelFunctor(buf_acc, rng, c));
+ /// With the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon.
+ template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const {
+ auto it = buffer_map.find(src);
+ if (it != buffer_map.end()) {
+ auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(it->second.get()))-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>();
+ memcpy(dst,host_acc.get_pointer(), n);
+ } else{
+ eigen_assert("no device memory found. The memory might be destroyed before creation");
}
- };
-
- EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
- // FIXME
- return 48*1024;
- }
-
- EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
- // We won't try to take advantage of the l2 cache for the time being, and
- // there is no l3 cache on cuda devices.
- return firstLevelCacheSize();
- }
- EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
- // return stream_->deviceProperties().multiProcessorCount;
}
- EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
- // return stream_->deviceProperties().maxThreadsPerBlock;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const {
+ ::memset(buffer, c, n);
}
- EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
- // OpenCL doesnot have such concept
- return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
- // return stream_->deviceProperties().maxThreadsPerMultiProcessor;
- }
- EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
- return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
- // return stream_->deviceProperties().sharedMemPerBlock;
- }
- /// No need for sycl it should act the same as CPU version
- EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
-
- EIGEN_STRONG_INLINE void synchronize() const {
- sycl_queue().wait_and_throw(); //pass
- }
-
- EIGEN_STRONG_INLINE void asynchronousExec() const {
- ///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
- //sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
- sycl_queue().wait_and_throw(); //pass
-
- }
- // This function checks if the runtime recorded an error for the
- // underlying stream device.
- EIGEN_STRONG_INLINE bool ok() const {
- return m_queue_stream->ok();
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+ return 1;
}
};
-
-
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
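
The rewritten SyclDevice above keys its buffer_map on the host pointer and stores the buffers type-erased as std::shared_ptr<void>, so buffers of different scalar types can share one container and are destroyed by a deleter that remembers the concrete type. The standalone sketch below illustrates only that ownership pattern; BufferRegistry and FakeBuffer are hypothetical names standing in for the SyclDevice map and cl::sycl::buffer<T, 1>, not Eigen or SYCL classes.

#include <cstddef>
#include <iostream>
#include <map>
#include <memory>

// FakeBuffer stands in for cl::sycl::buffer<T, 1>: it only reports when it is
// destroyed, so the ownership behaviour is visible.
template <typename T>
struct FakeBuffer {
  explicit FakeBuffer(std::size_t n) : size(n) {}
  ~FakeBuffer() { std::cout << "destroying buffer of " << size << " elements\n"; }
  std::size_t size;
};

// One map holds buffers of any scalar type: the concrete type is erased behind
// std::shared_ptr<void>, and the deleter captured at creation time knows how
// to delete the real buffer type when the entry is erased.
class BufferRegistry {
 public:
  template <typename T>
  FakeBuffer<T>* add(const T* host_ptr, std::size_t num_elems) {
    auto it = map_.find(host_ptr);
    if (it == map_.end()) {
      using Buf = FakeBuffer<T>;
      std::shared_ptr<void> holder(new Buf(num_elems),
                                   [](void* p) { delete static_cast<Buf*>(p); });
      it = map_.emplace(host_ptr, std::move(holder)).first;
    }
    return static_cast<FakeBuffer<T>*>(it->second.get());
  }

  void deallocate(const void* host_ptr) { map_.erase(host_ptr); }  // deleter runs here

 private:
  std::map<const void*, std::shared_ptr<void>> map_;
};

int main() {
  float host[16];
  BufferRegistry registry;
  FakeBuffer<float>* buf = registry.add(host, 16);
  std::cout << "registered buffer of " << buf->size << " elements\n";
  registry.deallocate(host);  // prints the destruction message
  return 0;
}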
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 16180ca..069680a 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -12,6 +12,17 @@
namespace Eigen {
+// Use the SimpleThreadPool by default. We'll switch to the new non blocking
+// thread pool later.
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
+typedef NonBlockingThreadPool ThreadPool;
+#else
+template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
+typedef SimpleThreadPool ThreadPool;
+#endif
+
+
// Barrier is an object that allows one or more threads to wait until
// Notify has been called a specified number of times.
class Barrier {
@@ -245,7 +256,7 @@ struct ThreadPoolDevice {
// Split into halves and submit to the pool.
Index mid = first + divup((last - first) / 2, block_size) * block_size;
pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
- handleRange(first, mid);
+ pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
};
handleRange(0, n);
barrier.Wait();
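
The parallelFor change above stops running one half of each split on the calling thread and instead submits both halves back to the pool, with the Barrier counting finished leaf blocks. A rough standalone sketch of that scheduling shape, using plain std::thread in place of the Eigen thread pool; the Barrier, block_size and data sizes here are illustrative choices, not the library's values.

#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <numeric>
#include <thread>
#include <vector>

// Counts down from `count`; Wait() returns once Notify() has been called
// that many times (one call per finished leaf block).
class Barrier {
 public:
  explicit Barrier(unsigned count) : count_(count) {}
  void Notify() {
    std::lock_guard<std::mutex> lk(mu_);
    if (--count_ == 0) cv_.notify_all();
  }
  void Wait() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return count_ == 0; });
  }
 private:
  std::mutex mu_;
  std::condition_variable cv_;
  unsigned count_;
};

int main() {
  const long n = 16, block_size = 4;
  std::vector<long> data(n, 1);
  Barrier barrier(static_cast<unsigned>(n / block_size));
  std::vector<std::thread> workers;
  std::mutex workers_mu;

  // Recursively split [first, last); both halves go to worker threads,
  // mirroring the two Schedule() calls in the patched parallelFor.
  std::function<void(long, long)> handleRange = [&](long first, long last) {
    if (last - first <= block_size) {
      for (long i = first; i < last; ++i) data[i] *= 2;  // the per-block work
      barrier.Notify();
      return;
    }
    const long mid = first + (last - first) / 2;
    std::lock_guard<std::mutex> lk(workers_mu);
    workers.emplace_back([&handleRange, mid, last] { handleRange(mid, last); });
    workers.emplace_back([&handleRange, first, mid] { handleRange(first, mid); });
  };

  handleRange(0, n);
  barrier.Wait();  // all leaf blocks have run
  std::lock_guard<std::mutex> lk(workers_mu);
  for (auto& t : workers) t.join();
  std::cout << std::accumulate(data.begin(), data.end(), 0L) << "\n";  // 32
  return 0;
}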
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 86405e6..b24cdeb 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -33,7 +33,7 @@ namespace Eigen {
namespace internal {
template<std::size_t n, typename Dimension> struct dget {
- static const std::ptrdiff_t value = get<n, Dimension>::value;
+ static const std::size_t value = get<n, Dimension>::value;
};
@@ -90,11 +90,9 @@ struct fixed_size_tensor_index_extraction_helper<Index, 0>
// Fixed size
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::ptrdiff_t... Indices>
-struct Sizes {
+struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
- const Base t = Base();
static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
- static const size_t count = Base::count;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
return Base::count;
@@ -122,16 +120,16 @@ struct Sizes {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const {
- return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
+ return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, *this);
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t);
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this));
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t);
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this));
}
};
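
The Sizes change above drops the stored Base member and inherits from numeric_list directly, so *this can be handed straight to the index-linearization helpers. A minimal sketch of that empty-base idea, written against C++17 with hypothetical numeric_list and Sizes stand-ins rather than Eigen's internal classes:

#include <array>
#include <cstddef>
#include <iostream>

// Compile-time list of dimensions; the values live entirely in the type, so
// the base class is empty at runtime.
template <std::ptrdiff_t... Values>
struct numeric_list {
  static constexpr std::size_t count = sizeof...(Values);
  static constexpr std::ptrdiff_t values[sizeof...(Values)] = {Values...};
};

// Fixed-size dimensions: inheriting from the list means helpers that expect
// the list can be handed *this directly, instead of a stored copy of it.
template <std::ptrdiff_t... Dims>
struct Sizes : numeric_list<Dims...> {
  using Base = numeric_list<Dims...>;
  static constexpr std::ptrdiff_t total_size = (Dims * ... * 1);

  constexpr std::ptrdiff_t rank() const { return static_cast<std::ptrdiff_t>(Base::count); }
  constexpr std::ptrdiff_t operator[](std::size_t i) const { return Base::values[i]; }

  // Column-major linearization: dimension 0 varies fastest.
  template <typename Index>
  Index IndexOfColMajor(const std::array<Index, sizeof...(Dims)>& idx) const {
    Index result = 0, stride = 1;
    for (std::size_t d = 0; d < Base::count; ++d) {
      result += idx[d] * stride;
      stride *= static_cast<Index>(Base::values[d]);
    }
    return result;
  }
};

int main() {
  Sizes<2, 3, 4> dims;
  std::cout << dims.rank() << " dims, total size " << dims.total_size << "\n";  // 3, 24
  std::cout << dims.IndexOfColMajor(std::array<int, 3>{1, 2, 3}) << "\n";       // 1 + 2*2 + 3*6 = 23
  return 0;
}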
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 82dd1e6..0698713 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -41,9 +41,6 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
- typedef typename MakePointerT::RefType RefType;
-
-
};
};
@@ -120,7 +117,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const {
return m_op;
}
-
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
}
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index d641581..834ce07 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -32,7 +32,6 @@ struct TensorEvaluator
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
- typedef Derived XprType;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
@@ -69,9 +68,7 @@ struct TensorEvaluator
return m_data[index];
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
- coeffRef(Index index) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
eigen_assert(m_data);
return m_data[index];
}
@@ -97,9 +94,7 @@ struct TensorEvaluator
}
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::traits<Derived>::template MakePointer<Scalar>::RefType
- coeffRef(const array<DenseIndex, NumCoords>& coords) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
eigen_assert(m_data);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
@@ -157,8 +152,6 @@ struct TensorEvaluator<const Derived, Device>
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
- typedef const Derived XprType;
-
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index f060191..08eb559 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -253,7 +253,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
// get data into line_buf
const Index stride = m_strides[dim];
if (stride == 1) {
- m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+ memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
} else {
Index offset = base_offset;
for (int j = 0; j < line_len; ++j, offset += stride) {
@@ -271,7 +271,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
// write back
if (FFTDir == FFT_FORWARD && stride == 1) {
- m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+ memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
} else {
Index offset = base_offset;
const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
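
The two hunks above replace the device memcpy helper with a plain host memcpy on the stride-1 fast path, since this evaluator runs on the host. A tiny sketch of that fast-path/gather split; gather_line is a hypothetical helper, not the evaluator's code.

#include <complex>
#include <cstddef>
#include <cstring>
#include <iostream>

using ComplexScalar = std::complex<float>;

// Copy one FFT "line" out of a strided buffer into contiguous scratch space.
// For stride 1 the line is already contiguous and memcpy suffices; otherwise
// fall back to an element-by-element gather.
void gather_line(ComplexScalar* line_buf, const ComplexScalar* buf,
                 std::ptrdiff_t base_offset, std::ptrdiff_t stride,
                 std::ptrdiff_t line_len) {
  if (stride == 1) {
    std::memcpy(line_buf, buf + base_offset, line_len * sizeof(ComplexScalar));
  } else {
    std::ptrdiff_t offset = base_offset;
    for (std::ptrdiff_t j = 0; j < line_len; ++j, offset += stride) {
      line_buf[j] = buf[offset];
    }
  }
}

int main() {
  ComplexScalar buf[8];
  for (int i = 0; i < 8; ++i) buf[i] = ComplexScalar(float(i), 0.f);
  ComplexScalar line[4];
  gather_line(line, buf, /*base_offset=*/1, /*stride=*/2, /*line_len=*/4);
  std::cout << line[3].real() << "\n";  // element at offset 1 + 3*2 = 7
  return 0;
}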
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index abe85c8..bbd5eb3 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -26,8 +26,8 @@ namespace Eigen {
/// Therefore, by adding the default value, we managed to convert the type and it does not break any
/// existing code as its default value is T*.
namespace internal {
-template<typename XprType>
-struct traits<TensorForcedEvalOp<XprType> >
+template<typename XprType, template <class> class MakePointer_>
+struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename XprType::Scalar Scalar;
@@ -42,26 +42,31 @@ struct traits<TensorForcedEvalOp<XprType> >
enum {
Flags = 0
};
+ template <class T> struct MakePointer {
+ // Intermediate typedef to workaround MSVC issue.
+ typedef MakePointer_<T> MakePointerT;
+ typedef typename MakePointerT::Type Type;
+ };
};
-template<typename XprType>
-struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
+template<typename XprType, template <class> class MakePointer_>
+struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
{
- typedef const TensorForcedEvalOp<XprType>& type;
+ typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
};
-template<typename XprType>
-struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
+template<typename XprType, template <class> class MakePointer_>
+struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
{
- typedef TensorForcedEvalOp<XprType> type;
+ typedef TensorForcedEvalOp<XprType, MakePointer_> type;
};
} // end namespace internal
-template<typename XprType>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
+template<typename XprType, template <class> class MakePointer_>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -83,10 +88,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOn
};
-template<typename ArgType, typename Device>
-struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
+template<typename ArgType, typename Device, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
{
- typedef TensorForcedEvalOp<ArgType> XprType;
+ typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
typedef typename XprType::Index Index;
@@ -102,7 +107,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
};
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
- /// op_ is used for sycl
+ /// op_ is used for sycl
: m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
{ }
@@ -143,17 +148,17 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; }
+ EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
/// required by sycl in order to extract the sycl accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
+ const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
/// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
+ const Device& device() const{return m_device;}
private:
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;
const Device& m_device;
- CoeffReturnType* m_buffer;
+ typename MakePointer<CoeffReturnType>::Type m_buffer;
};
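
TensorForcedEvalOp now takes the pointer-maker as a template template parameter defaulting to MakePointer, so a device backend can substitute its own pointer type for the evaluation buffer. A minimal sketch of that pattern with hypothetical ForcedEval and DummyExpr names, not the Eigen classes:

#include <iostream>

// Default pointer-maker: maps T to a plain host pointer. A device backend can
// supply its own maker whose Type is a device-side pointer wrapper instead.
template <typename T>
struct MakePointer {
  typedef T* Type;
};

// The pointer-maker is a template template parameter with the host default,
// so code that names ForcedEval<Expr> keeps compiling, while a backend can
// instantiate ForcedEval<Expr, MyDevicePointerMaker>.
template <typename XprType, template <class> class MakePointer_ = MakePointer>
class ForcedEval {
 public:
  typedef typename XprType::Scalar Scalar;
  typedef typename MakePointer_<Scalar>::Type BufferPtr;

  explicit ForcedEval(BufferPtr buffer) : m_buffer(buffer) {}
  BufferPtr data() const { return m_buffer; }

 private:
  BufferPtr m_buffer;  // whatever pointer type the backend chose
};

struct DummyExpr { typedef float Scalar; };

int main() {
  float storage[4] = {1.f, 2.f, 3.f, 4.f};
  ForcedEval<DummyExpr> eval(storage);  // uses the default MakePointer
  std::cout << eval.data()[2] << "\n";  // prints 3
  return 0;
}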
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 2e63899..52b803d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -20,19 +20,7 @@ namespace Eigen {
// map_allocator.
template<typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
};
-#if defined(EIGEN_USE_SYCL)
-namespace TensorSycl {
-namespace internal{
-template <typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor;
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor;
-}
-}
-#endif
-
-
template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
@@ -75,7 +63,7 @@ template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
-template<typename XprType> class TensorForcedEvalOp;
+template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
template<typename ExpressionType, typename DeviceType> class TensorDevice;
template<typename Derived, typename Device> struct TensorEvaluator;
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 3b4f8ed..d73f6dc 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -33,7 +33,7 @@ struct functor_traits<scalar_mod_op<Scalar> >
*/
template <typename Scalar>
struct scalar_mod2_op {
- EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
};
template <typename Scalar>
@@ -42,7 +42,7 @@ struct functor_traits<scalar_mod2_op<Scalar> >
template <typename Scalar>
struct scalar_fmod_op {
- EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op)
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
operator()(const Scalar& a, const Scalar& b) const {
return numext::fmod(a, b);
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index ef1c9c4..ede3939 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -37,8 +37,6 @@ namespace {
{
#ifdef __CUDA_ARCH__
return __clz(val);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::clz(val);
#elif EIGEN_COMP_MSVC
unsigned long index;
_BitScanReverse(&index, val);
@@ -55,8 +53,6 @@ namespace {
{
#ifdef __CUDA_ARCH__
return __clzll(val);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::clz(val);
#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
unsigned long index;
_BitScanReverse64(&index, val);
@@ -92,8 +88,6 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
#if defined(__CUDA_ARCH__)
return __umulhi(a, b);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
#else
return (static_cast<uint64_t>(a) * b) >> 32;
#endif
@@ -103,8 +97,6 @@ namespace {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(__CUDA_ARCH__)
return __umul64hi(a, b);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
#elif defined(__SIZEOF_INT128__)
__uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
return static_cast<uint64_t>(v >> 64);
@@ -124,7 +116,7 @@ namespace {
template <typename T>
struct DividerHelper<64, T> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
@@ -205,8 +197,6 @@ class TensorIntDivisor<int32_t, true> {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
#ifdef __CUDA_ARCH__
return (__umulhi(magic, n) >> shift);
-#elif defined(__SYCL_DEVICE_ONLY__)
- return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
#else
uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
return (static_cast<uint32_t>(v >> 32) >> shift);
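
With the SYCL branches gone, muluh above relies on either the CUDA intrinsic or the __int128 path. The standalone sketch below shows the 64x64-to-high-64 multiply it computes, with a portable 32-bit-halves fallback; this is a hypothetical helper illustrating the technique, not Eigen's exact code.

#include <cstdint>
#include <iostream>

// High 64 bits of a 64x64-bit unsigned multiply. Compilers with __int128
// support can do this directly; the fallback decomposes both operands into
// 32-bit halves and sums the partial products without overflow.
static inline uint64_t muluh(uint64_t a, uint64_t b) {
#if defined(__SIZEOF_INT128__)
  return static_cast<uint64_t>((static_cast<__uint128_t>(a) * b) >> 64);
#else
  const uint64_t a_lo = a & 0xFFFFFFFFull, a_hi = a >> 32;
  const uint64_t b_lo = b & 0xFFFFFFFFull, b_hi = b >> 32;
  const uint64_t lo_lo = a_lo * b_lo;
  const uint64_t hi_lo = a_hi * b_lo;
  const uint64_t lo_hi = a_lo * b_hi;
  const uint64_t hi_hi = a_hi * b_hi;
  // The cross term fits in 64 bits, so no carries are lost.
  const uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFFull) + lo_hi;
  return hi_hi + (hi_lo >> 32) + (cross >> 32);
#endif
}

int main() {
  // 2^63 * 4 = 2^65, whose high 64 bits are 2.
  std::cout << muluh(1ull << 63, 4) << "\n";  // prints 2
  return 0;
}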
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index f92e39d..ee0078b 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -51,12 +51,4 @@
#endif
-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
-#define EIGEN_SLEEP(n) Sleep(n)
-#elif EIGEN_OS_GNULINUX
-#define EIGEN_SLEEP(n) usleep(n * 1000);
-#else
-#define EIGEN_SLEEP(n) sleep(std::max<unsigned>(1, n/1000))
-#endif
-
#endif
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b5ef31d..615559d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -75,7 +75,6 @@ struct PacketType<half, GpuDevice> {
HasSqrt = 1,
HasRsqrt = 1,
HasExp = 1,
- HasExpm1 = 0,
HasLog = 1,
HasLog1p = 0,
HasLog10 = 0,
@@ -169,12 +168,12 @@ template <typename Idx> struct IndexPair {
#ifdef EIGEN_HAS_SFINAE
namespace internal {
- template<typename IndexType, typename Index, Index... Is>
+ template<typename IndexType, Index... Is>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
return { idx[Is]... };
}
- template<typename IndexType, typename Index>
+ template<typename IndexType>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
return array<Index, 0>();
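
The SFINAE helper above expands a compile-time index pack to turn a user-supplied index object into a fixed-size array. A rough equivalent using std::index_sequence, for illustration only; Eigen's version is built on its own numeric_list rather than the standard utility.

#include <array>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Expand a compile-time index pack to read idx[0], idx[1], ... and build a
// fixed-size array out of any object that provides operator[].
template <typename Index, typename IndexType, std::size_t... Is>
std::array<Index, sizeof...(Is)> customIndices2Array(const IndexType& idx,
                                                     std::index_sequence<Is...>) {
  return {{ static_cast<Index>(idx[Is])... }};
}

int main() {
  std::vector<int> custom = {7, 8, 9};
  auto arr = customIndices2Array<std::ptrdiff_t>(custom, std::make_index_sequence<3>{});
  std::cout << arr[0] << " " << arr[1] << " " << arr[2] << "\n";  // 7 8 9
  return 0;
}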
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index 6ddd2ca..d34f1e3 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -299,16 +299,6 @@ template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice> {
EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
};
#endif
-
-// It is very expensive to start the memcpy kernel on GPU: we therefore only
-// use it for large copies.
-#ifdef EIGEN_USE_SYCL
-template <typename Index> struct MemcpyTriggerForSlicing<Index, const Eigen::SyclDevice> {
- EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
- EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
-};
-#endif
-
}
// Eval as rvalue
@@ -503,14 +493,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
return NULL;
}
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{
- return m_impl;
- }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& startIndices() const{
- return m_offsets;
- }
+
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
@@ -711,12 +694,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
{
typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
static const int NumDims = internal::array_size<Strides>::value;
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef Strides Dimensions;
enum {
// Alignment can't be guaranteed at compile time since it depends on the
@@ -729,7 +706,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()), m_exprStartIndices(op.startIndices()), m_exprStopIndices(op.stopIndices())
+ : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
{
// Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
@@ -739,7 +716,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
}else{
- /* implies m_strides[i]<0 by assert */
+        /* implies m_strides[i] < 0 by the assert above */
startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
}
@@ -802,6 +779,13 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
sizeof(Scalar));
}
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Strides Dimensions;
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -827,15 +811,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
return NULL;
}
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStartIndices() const { return m_exprStartIndices; }
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& exprStopIndices() const { return m_exprStopIndices; }
- //use by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
@@ -857,11 +832,7 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
}
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
-#ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value));
-#else
- return cl::sycl::clamp(value, min, max);
-#endif
}
array<Index, NumDims> m_outputStrides;
@@ -874,10 +845,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
const Strides m_strides;
std::size_t m_block_total_size_max;
- //use by sycl
- const StartIndices m_exprStartIndices;
- //use by sycl
- const StopIndices m_exprStopIndices;
};
// Eval as lvalue
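
The strided-slicing evaluator above clamps start and stop indices into [0, dim] for positive strides and [-1, dim-1] for negative ones, so degenerate intervals collapse to an empty dimension instead of reading out of bounds. A small illustration of that clamping and the resulting dimension size; sliceSize is a hypothetical helper, not the evaluator's exact formula.

#include <algorithm>
#include <cstddef>
#include <iostream>

using Index = std::ptrdiff_t;

static Index clamp(Index value, Index min, Index max) {
  return std::max(min, std::min(max, value));
}

// Number of elements a strided slice [start, stop) with the given stride
// visits inside a dimension of size dim, after clamping. Degenerate
// intervals simply produce 0.
static Index sliceSize(Index start, Index stop, Index stride, Index dim) {
  Index lo, hi, step;
  if (stride > 0) {
    lo = clamp(start, 0, dim);
    hi = clamp(stop, 0, dim);
    step = stride;
  } else {  // stride < 0: the slice walks backwards through the dimension
    hi = clamp(start, -1, dim - 1);
    lo = clamp(stop, -1, dim - 1);
    step = -stride;
  }
  const Index span = hi - lo;
  return span > 0 ? (span + step - 1) / step : 0;
}

int main() {
  std::cout << sliceSize(0, 10, 2, 5) << "\n";    // stop clamps to 5 -> elements 0,2,4 -> 3
  std::cout << sliceSize(4, -10, -1, 5) << "\n";  // stop clamps to -1 -> 4,3,2,1,0 -> 5
  return 0;
}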
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index a8e2552..647bcf1 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -200,13 +200,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PaddingDimensions& padding() const { return m_padding; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& padding_value() const { return m_paddingValue; }
- /// used by sycl
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const{return m_impl;}
-
private:
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
Index index, int dim_index) const {
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index e341e2e..41d0d00 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -11,20 +11,8 @@
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
-// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
-// so we'll use a macro to make clang happy.
-#ifndef KERNEL_FRIEND
-#if defined(__clang__) && defined(__CUDA__)
-#define KERNEL_FRIEND friend __global__
-#else
-#define KERNEL_FRIEND friend
-#endif
-#endif
-
-
namespace Eigen {
-
/** \class TensorReduction
* \ingroup CXX11_Tensor_Module
*
@@ -692,23 +680,17 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
- template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+ template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
- template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
- template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
-#endif
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
-
- template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
+ template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+ template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+ template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
#endif
+ template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
-#if defined(EIGEN_USE_SYCL)
- template < typename HostExpr_, typename FunctorExpr_, typename Tuple_of_Acc_, typename Dims_, typename Op_, typename Index_> friend class TensorSycl::internal::ReductionFunctor;
- template<typename CoeffReturnType_ ,typename OutAccessor_, typename HostExpr_, typename FunctorExpr_, typename Op_, typename Dims_, typename Index_, typename TupleType_> friend class TensorSycl::internal::FullReductionKernelFunctor;
+ template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif
-
template <typename S, typename O, typename D> friend struct internal::InnerReducer;
// Returns the Index in the input tensor of the first value that needs to be
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index edb0ab2..65638b6 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -287,6 +287,7 @@ struct FullReductionLauncher<
void>::type> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
+ typedef typename Self::CoeffReturnType Scalar;
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index c3ca129..3daecb0 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -25,28 +25,61 @@
namespace Eigen {
namespace internal {
-template<typename OP, typename CoeffReturnType> struct syclGenericBufferReducer{
+template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
-static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
do {
- auto f = [length, local, op, &bufOut, &bufI](cl::sycl::handler& h) mutable {
+ auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
cl::sycl::range<1>{std::min(length, local)}};
/* Two accessors are used: one to the buffer that is being reduced,
* and a second to local memory, used to store intermediate data. */
- auto aI =bufI.template get_access<cl::sycl::access::mode::read_write>(h);
- auto aOut =bufOut.template get_access<cl::sycl::access::mode::discard_write>(h);
- typedef decltype(aI) InputAccessor;
- typedef decltype(aOut) OutputAccessor;
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,cl::sycl::access::target::local> LocalAccessor;
- LocalAccessor scratch(cl::sycl::range<1>(local), h);
+ auto aI =
+ bufI.template get_access<cl::sycl::access::mode::read_write>(h);
+ auto aOut =
+ bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
+ cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ scratch(cl::sycl::range<1>(local), h);
/* The parallel_for invocation chosen is the variant with an nd_item
* parameter, since the code requires barriers for correctness. */
- h.parallel_for(r, TensorSycl::internal::GenericKernelReducer<CoeffReturnType, OP, OutputAccessor, InputAccessor, LocalAccessor>(op, aOut, aI, scratch, length, local));
+ h.parallel_for<KernelName>(
+ r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
+ size_t globalid = id.get_global(0);
+ size_t localid = id.get_local(0);
+ /* All threads collectively read from global memory into local.
+ * The barrier ensures all threads' IO is resolved before
+ * execution continues (strictly speaking, all threads within
+ * a single work-group - there is no co-ordination between
+ * work-groups, only work-items). */
+ if (globalid < length) {
+ scratch[localid] = aI[globalid];
+ }
+ id.barrier(cl::sycl::access::fence_space::local_space);
+
+ /* Apply the reduction operation between the current local
+ * id and the one on the other half of the vector. */
+ if (globalid < length) {
+ int min = (length < local) ? length : local;
+ for (size_t offset = min / 2; offset > 0; offset /= 2) {
+ if (localid < offset) {
+ scratch[localid] += scratch[localid + offset];
+ }
+ id.barrier(cl::sycl::access::fence_space::local_space);
+ }
+ /* The final result will be stored in local id 0. */
+ if (localid == 0) {
+ aI[id.get_group(0)] = scratch[localid];
+ if((length<=local) && globalid ==0){
+ aOut[globalid]=scratch[localid];
+ }
+ }
+ }
+ });
};
- dev.sycl_queue().submit(f);
- dev.asynchronousExec();
+ dev.m_queue.submit(f);
+ dev.m_queue.throw_asynchronous();
/* At this point, you could queue::wait_and_throw() to ensure that
* errors are caught quickly. However, this would likely impact
@@ -54,23 +87,18 @@ static void run(OP op, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDev
length = length / local;
} while (length > 1);
-}
-};
-template<typename CoeffReturnType> struct syclGenericBufferReducer<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType>{
-template<typename BufferTOut, typename BufferTIn>
-static void run(Eigen::internal::MeanReducer<CoeffReturnType>, BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
- syclGenericBufferReducer<Eigen::internal::SumReducer<CoeffReturnType>, CoeffReturnType>::run(Eigen::internal::SumReducer<CoeffReturnType>(),
- bufOut, bufI, dev, length, local);
+
}
+
};
+/// For now let's start with a full reducer
/// Self is useless here because in expression construction we are going to treat reduction as a leaf node.
/// We want to take the reduction child, build a construction from it and apply the full reducer function on it. The full reducer applies the
/// reduction operation on the child of the reduction. Once that is done the reduction is an empty shell and can be thrown away and treated as
// a leaf node.
-
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
@@ -79,8 +107,8 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
- FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
+ typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+ auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor =256; /// initial reduction. If the size is less than red_factor we only create one thread.
size_t inputSize =self.impl().dimensions().TotalSize();
size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
@@ -88,7 +116,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
if(rng ==0) {
red_factor=1;
};
- size_t tileSize =dev.sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+ size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
size_t GRange=std::max((size_t )1, rng);
    // convert the global range to a power of 2 for the reduction
@@ -105,66 +133,105 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
size_t outTileSize = tileSize;
/// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
if (GRange < outTileSize) outTileSize=GRange;
+    // get the final output buffer; the buffer is created at this point because there is no need for an assign step
+ auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
/// creating the shared memory for calculating reduction.
    /// This one is used to collect all the reduced values from shared memory as we don't have a global barrier on the GPU. Once it is saved we can
/// recursively apply reduction on it in order to reduce the whole.
auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
- // Dims dims= self.xprDims();
- //Op functor = reducer;
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // this is a workaround for gcc 4.8 bug
- typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType;
+ Dims dims= self.xprDims();
+ Op functor = reducer;
+ dev.m_queue.submit([&](cl::sycl::handler &cgh) {
// create a tuple of accessors from Evaluator
- TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+ auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
- typedef decltype(tmp_global_accessor) OutAccessor;
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),
- TensorSycl::internal::FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Op, Dims, size_t, TupleType>
- (tmp_global_accessor, rng, remaining, red_factor, reducer, self.xprDims(), functors, tuple_of_accessors));
+
+ cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
+ typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+ auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ /// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
+ /// the first is when it is used as a root to launch the sub-kernel, and the second is when it is treated as a leaf node that passes its
+ /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
+ const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+ /// This is the evaluator for device_self_expr. It is equivalent to the self that was passed to the run function; the difference is that
+ /// this device evaluator is detectable and recognisable on the device.
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ /// const_cast added as a simple workaround for the qualifier-drop error
+ auto globalid=itemID.get_global_linear_id();
+
+ if(globalid<rng)
+ tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
+ else
+ tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
+
+ if(remaining!=0 && globalid==0 )
+ // this adds the remainder of the input when the input size is not divisible by red_factor.
+ tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
+ });
});
- dev.asynchronousExec();
+ dev.m_queue.throw_asynchronous();
- // getting final out buffer at the moment the created buffer is true because there is no need for assign
- auto out_buffer =dev.get_sycl_buffer(output);
- /// This is used to recursively reduce the tmp value to an element of 1;
- syclGenericBufferReducer<Op, CoeffReturnType>::run(reducer, out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+/// This is used to recursively reduce the temporary buffer down to a single element.
+ syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
}
};
-
template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
typedef typename Self::CoeffReturnType CoeffReturnType;
static const bool HasOptimizedImplementation = false;
- static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve) {
+ static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
- FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
- typename Self::Index range, GRange, tileSize;
- typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+ typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+ auto functors = TensorSycl::internal::extractFunctors(self.impl());
+
+ size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+ size_t GRange=num_coeffs_to_preserve;
+ if (tileSize>GRange) tileSize=GRange;
+ else if(GRange>tileSize){
+ size_t xMode = GRange % tileSize;
+ if (xMode != 0) GRange += (tileSize - xMode);
+ }
// getting final out buffer at the moment the created buffer is true because there is no need for assign
/// creating the shared memory for calculating the reduction.
/// This one is used to collect all the reduced values of shared memory, as we don't have a global barrier on the GPU. Once it is saved we can
/// recursively apply the reduction on it in order to reduce the whole.
- dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // this is workaround for gcc 4.8 bug.
- typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
- // create a tuple of accessors from Evaluator
- Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);
- Index red_size = (num_values_to_reduce!=0)? num_values_to_reduce : static_cast<Index>(1);
- cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
- TensorSycl::internal::ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Op, typename Self::Index>
- (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range, red_size));
+ typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+ Dims dims= self.xprDims();
+ Op functor = reducer;
+ dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+ // create a tuple of accessors from Evaluator
+ auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+ auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
+
+ cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+ typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+ auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ /// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
+ /// the first is when it is used as a root to launch the sub-kernel, and the second is when it is treated as a leaf node that passes its
+ /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
+ const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+ /// This is the evaluator for device_self_expr. It is equivalent to the self that was passed to the run function; the difference is that
+ /// this device evaluator is detectable and recognisable on the device.
+ typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeiceSelf;
+ auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+ /// const_cast added as a simple workaround for the qualifier-drop error
+ auto globalid=itemID.get_global_linear_id();
+ if (globalid< static_cast<size_t>(num_coeffs_to_preserve)) {
+ typename DeiceSelf::CoeffReturnType accum = functor.initialize();
+ GenericDimReducer<DeiceSelf::NumReducedDims-1, DeiceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast<Op&>(functor), &accum);
+ functor.finalize(accum);
+ output_accessor.get_pointer()[globalid]= accum;
+ }
+ });
});
- dev.asynchronousExec();
+ dev.m_queue.throw_asynchronous();
return false;
}
};
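
The comments in the reduction hunk above describe a two-pass scheme: each work item first reduces a fixed-size chunk of the input into a temporary global buffer, work item 0 folds in any remainder, and the temporary buffer is then reduced again (by syclGenericBufferReducer) down to one element. The plain C++ sketch below mirrors only the launch-geometry and indexing arithmetic of that scheme; it uses no SYCL, and all names (LaunchGeometry, make_geometry) are illustrative rather than Eigen API.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

struct LaunchGeometry {
  std::size_t red_factor;  // elements reduced per work item in the first pass
  std::size_t rng;         // number of work items that own a full chunk
  std::size_t remaining;   // leftover elements folded into work item 0
  std::size_t tileSize;    // work-group size
  std::size_t GRange;      // global range, rounded up to a multiple of tileSize
};

LaunchGeometry make_geometry(std::size_t inputSize, std::size_t maxWorkGroup) {
  LaunchGeometry g;
  g.red_factor = 256;
  g.rng = inputSize / g.red_factor;
  g.remaining = inputSize % g.red_factor;
  if (g.rng == 0) g.red_factor = 1;              // tiny input: one element per item
  g.tileSize = maxWorkGroup / 2;
  g.GRange = std::max<std::size_t>(1, g.rng);
  // round the global range up so it is divisible by the work-group size
  if (g.tileSize > g.GRange) g.tileSize = g.GRange;
  else if (std::size_t m = g.GRange % g.tileSize) g.GRange += g.tileSize - m;
  return g;
}

int main() {
  std::vector<double> in(100000);
  std::iota(in.begin(), in.end(), 0.0);
  LaunchGeometry g = make_geometry(in.size(), 256);

  // first pass: work item `id` reduces red_factor consecutive elements
  std::vector<double> partial(g.GRange, 0.0);
  for (std::size_t id = 0; id < g.GRange; ++id)
    if (id < g.rng)
      for (std::size_t k = 0; k < g.red_factor; ++k)
        partial[id] += in[id * g.red_factor + k];
  // work item 0 also picks up the tail that does not fill a whole chunk
  for (std::size_t k = 0; k < g.remaining; ++k)
    partial[0] += in[g.rng * g.red_factor + k];

  // second pass (on the device this is done by syclGenericBufferReducer)
  double total = std::accumulate(partial.begin(), partial.end(), 0.0);
  std::cout << total << " == " << std::accumulate(in.begin(), in.end(), 0.0) << "\n";
}
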
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index e430b08..14e392e 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -224,11 +224,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; }
- /// added for sycl in order to construct the buffer from sycl device
- ReverseDimensions functor() const { return m_reverse; }
-
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index edc9dd3..113c060 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -117,7 +117,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_shuffle(op.shufflePermutation())
+ : m_impl(op.expression(), device)
{
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const Shuffle& shuffle = op.shufflePermutation();
@@ -187,11 +187,6 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- // required by sycl
- EIGEN_STRONG_INLINE const Shuffle& shufflePermutation() const {return m_shuffle;}
- // required by sycl
- EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const {return m_impl;}
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
Index inputIndex = 0;
@@ -211,12 +206,11 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
return inputIndex + index * m_inputStrides[NumDims - 1];
}
}
+
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
- /// required by sycl
- Shuffle m_shuffle;
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
index e6a666f..2854a4a 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -31,12 +31,12 @@ namespace Eigen {
*
* \sa Tensor
*/
-template<typename T, typename Dimensions, int Options> class TensorStorage;
+template<typename T, typename Dimensions, int Options_> class TensorStorage;
// Pure fixed-size storage
-template<typename T, typename FixedDimensions, int Options_>
-class TensorStorage
+template<typename T, int Options_, typename FixedDimensions>
+class TensorStorage<T, FixedDimensions, Options_>
{
private:
static const std::size_t Size = FixedDimensions::total_size;
@@ -66,7 +66,7 @@ class TensorStorage
// pure dynamic
-template<typename T, typename IndexType, int NumIndices_, int Options_>
+template<typename T, int Options_, typename IndexType, int NumIndices_>
class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
{
public:
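
For readers puzzled by the TensorStorage hunk just above: the change only reorders the parameters in the partial specialisations' own template parameter lists; what identifies a specialisation is the argument list after the class name, which still has to line up with the primary template's parameters. A minimal standalone illustration of the same construct, with made-up names Storage/Fixed (not Eigen types):

#include <cstddef>
#include <iostream>

template <typename T, typename Dimensions, int Options> class Storage;   // primary

template <std::size_t N> struct Fixed { static const std::size_t total_size = N; };

// The specialisation's parameters are declared as <T, Options_, FixedDimensions>,
// yet the specialised type still reads Storage<T, FixedDimensions, Options_>,
// matching the primary's <T, Dimensions, Options> order.
template <typename T, int Options_, typename FixedDimensions>
class Storage<T, FixedDimensions, Options_> {
 public:
  static const std::size_t Size = FixedDimensions::total_size;
  T data[Size];
};

int main() {
  Storage<float, Fixed<8>, 0> s;                        // picks the specialisation
  std::cout << sizeof(s.data) / sizeof(float) << "\n";  // prints 8
}
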
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index 2237140..6c35bfd 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -117,11 +117,11 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_strides(op.strides())
+ : m_impl(op.expression(), device)
{
m_dimensions = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
- m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+ m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -224,11 +224,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
- /// required by sycl in order to extract the accessor
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- /// required by sycl in order to extract the accessor
- Strides functor() const { return m_strides; }
-
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
@@ -255,9 +250,9 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
- const Strides m_strides;
};
+
// Eval as lvalue
template<typename Strides, typename ArgType, typename Device>
struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
@@ -291,11 +286,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
return this->m_impl.coeffRef(this->srcCoeff(index));
}
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
- /// required by sycl in order to extract the accessor
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; }
-
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
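
As a side note on the ceilf change above: the number of coefficients a strided evaluator keeps per dimension is ceil(dim / stride). A tiny self-contained check of that arithmetic, using the usual integer form instead of the float round-trip (purely illustrative, not Eigen code):

#include <cassert>
#include <cmath>
#include <cstdint>

std::int64_t strided_size(std::int64_t dim, std::int64_t stride) {
  return (dim + stride - 1) / stride;          // == ceil(dim / stride)
}

int main() {
  assert(strided_size(10, 3) == 4);            // keeps indices 0, 3, 6, 9
  assert(strided_size(9, 3) == 3);             // keeps indices 0, 3, 6
  assert(strided_size(10, 3) ==
         static_cast<std::int64_t>(std::ceil(10.0 / 3.0)));
  return 0;
}
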
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
index 9d5a6d4..bb8800d 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -20,14 +20,12 @@
template <class T>
struct MakeGlobalPointer {
typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
- typedef typename cl::sycl::global_ptr<T>::reference_t RefType;
};
// global pointer to set different attribute state for a class
template <class T>
struct MakeLocalPointer {
typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
- typedef typename cl::sycl::local_ptr<T>::reference_t RefType;
};
@@ -35,9 +33,6 @@ namespace Eigen {
namespace TensorSycl {
namespace internal {
- template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer;
-
-
/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
struct NoOP;
@@ -80,15 +75,8 @@ template<typename T> struct GetType<false, T>{
/// this is used for extracting tensor reduction
#include "TensorReductionSycl.h"
-/// this is used for extracting tensor convolution
-#include "TensorConvolutionSycl.h"
-
// kernel execution using fusion
#include "TensorSyclRun.h"
-//sycl functors
-#include "TensorSyclFunctors.h"
-
-#include "TensorContractionSycl.h"
#endif // end of EIGEN_USE_SYCL
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
index ee8f3c9..8729c86 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -48,9 +48,9 @@ struct DeviceConvertor{
/// specialisation of the \ref ConvertToDeviceExpression struct when the node
/// type is TensorMap
#define TENSORMAPCONVERT(CVQual)\
-template <typename T, int Options_, template <class> class MakePointer_>\
-struct ConvertToDeviceExpression<CVQual TensorMap<T, Options_, MakePointer_> > {\
- typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\
+template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_>\
+struct ConvertToDeviceExpression<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_> > {\
+ typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
};
TENSORMAPCONVERT(const)
@@ -97,18 +97,8 @@ template <typename Expr>\
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
: DeviceConvertor<ExprNode, Res, Expr>{};
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp
-#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\
-template <typename Expr>\
-struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\
- typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\
-};
-KERNELBROKERCONVERTFORCEDEVAL(const)
-KERNELBROKERCONVERTFORCEDEVAL()
-#undef KERNELBROKERCONVERTFORCEDEVAL
-
-
-
+KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)
+KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
KERNELBROKERCONVERT(const, true, TensorEvalToOp)
KERNELBROKERCONVERT(, false, TensorEvalToOp)
#undef KERNELBROKERCONVERT
@@ -124,40 +114,6 @@ KERNELBROKERCONVERTREDUCTION(const)
KERNELBROKERCONVERTREDUCTION()
#undef KERNELBROKERCONVERTREDUCTION
-#define KERNELBROKERCONVERTSLICEOP(CVQual)\
-template<typename StartIndices, typename Sizes, typename XprType>\
-struct ConvertToDeviceExpression<CVQual TensorSlicingOp <StartIndices, Sizes, XprType> >{\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename ConvertToDeviceExpression<XprType>::Type> Type;\
-};
-
-KERNELBROKERCONVERTSLICEOP(const)
-KERNELBROKERCONVERTSLICEOP()
-#undef KERNELBROKERCONVERTSLICEOP
-
-
-#define KERNELBROKERCONVERTERSLICESTRIDEOP(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
-struct ConvertToDeviceExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >{\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename ConvertToDeviceExpression<XprType>::Type> Type;\
-};
-
-KERNELBROKERCONVERTERSLICESTRIDEOP(const)
-KERNELBROKERCONVERTERSLICESTRIDEOP()
-#undef KERNELBROKERCONVERTERSLICESTRIDEOP
-
-
-/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorChippingOp
-#define KERNELBROKERCONVERTCHIPPINGOP(CVQual)\
-template <DenseIndex DimId, typename Expr>\
-struct ConvertToDeviceExpression<CVQual TensorChippingOp<DimId, Expr> > {\
- typedef CVQual TensorChippingOp<DimId, typename ConvertToDeviceExpression<Expr>::Type> Type;\
-};
-KERNELBROKERCONVERTCHIPPINGOP(const)
-KERNELBROKERCONVERTCHIPPINGOP()
-#undef KERNELBROKERCONVERTCHIPPINGOP
-
-
-
} // namespace internal
} // namespace TensorSycl
} // namespace Eigen
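
The ConvertToDeviceExpression machinery above is a recursive type rewrite: it walks the expression type, swaps each leaf's pointer maker for MakeGlobalPointer, and reproduces every interior node around the converted child. A stripped-down sketch of that idea with invented node types (Leaf, Unary, HostPtr, DevicePtr, ToDevice), not Eigen's actual classes:

#include <type_traits>

template <class T> struct HostPtr   { typedef T* Type; };
template <class T> struct DevicePtr { typedef T* Type; };   // stand-in for global_ptr

template <typename T, template <class> class MakePtr> struct Leaf {
  typename MakePtr<T>::Type data;
};
template <typename Nested> struct Unary { Nested nested; };

// primary: leave unknown nodes untouched
template <typename Expr> struct ToDevice { typedef Expr Type; };
// leaves: rebind the pointer maker to the device one
template <typename T, template <class> class MakePtr>
struct ToDevice<Leaf<T, MakePtr> > { typedef Leaf<T, DevicePtr> Type; };
// interior nodes: recurse into the child, keep the node itself
template <typename Nested>
struct ToDevice<Unary<Nested> > { typedef Unary<typename ToDevice<Nested>::Type> Type; };

static_assert(std::is_same<ToDevice<Unary<Leaf<float, HostPtr> > >::Type,
                           Unary<Leaf<float, DevicePtr> > >::value,
              "leaf pointer maker is rewritten, tree shape preserved");

int main() {}
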
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
index 3b83b1d..7ed3a3a 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -25,21 +25,12 @@
namespace Eigen {
namespace TensorSycl {
namespace internal {
-
-template <typename Expr, typename Dims>
-struct DeviceFixedSizeTensor;
-
-template <typename Expr, typename std::ptrdiff_t... Indices>
-struct DeviceFixedSizeTensor<Expr, Eigen::Sizes<Indices...>>{
- template<typename Data>
- static EIGEN_ALWAYS_INLINE Expr instantiate(Data& dt) {return Expr(ConvertToActualTypeSycl(typename Expr::Scalar, dt), Indices...);}
-};
/// this class is used by EvalToOp in order to create an lhs expression which is
/// a pointer from an accessor on device-only buffer
template <typename PtrType, size_t N, typename... Params>
struct EvalToLHSConstructor {
PtrType expr;
- EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t) : expr(ConvertToActualTypeSycl(typename Eigen::internal::remove_all<PtrType>::type, utility::tuple::get<N>(t))) {}
+ EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t): expr((&(*(utility::tuple::get<N>(t).get_pointer())))) {}
};
/// \struct ExprConstructor is used to reconstruct the expression on the device and
@@ -54,39 +45,21 @@ struct ExprConstructor;
/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorMap
#define TENSORMAP(CVQual)\
-template <typename T, int Options_,\
+template <typename Scalar_, int Options_, int Options2_, int Options3_, int NumIndices_, typename IndexType_,\
template <class> class MakePointer_, size_t N, typename... Params>\
-struct ExprConstructor< CVQual TensorMap<T, Options_, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N>, Params...>{\
- typedef CVQual TensorMap<T, Options_, MakeGlobalPointer> Type;\
+struct ExprConstructor< CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>,\
+CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options3_, MakePointer_>, N>, Params...>{\
+ typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())){}\
+ : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
};
-
TENSORMAP(const)
TENSORMAP()
#undef TENSORMAP
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorMap
-#define TENSORMAPFIXEDSIZE(CVQual)\
-template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_,\
-template <class> class MakePointer_, size_t N, typename... Params>\
-struct ExprConstructor< CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer>,\
-CVQual PlaceHolder<CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_>, N>, Params...>{\
- typedef CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakeGlobalPointer> Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &, const utility::tuple::Tuple<Params...> &t)\
- : expr(DeviceFixedSizeTensor<Type,Dimensions_>::instantiate(utility::tuple::get<N>(t))){}\
-};
-TENSORMAPFIXEDSIZE(const)
-TENSORMAPFIXEDSIZE()
-#undef TENSORMAPFIXEDSIZE
-
#define UNARYCATEGORY(CVQual)\
template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\
struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\
@@ -188,30 +161,8 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual
ASSIGN(const)
ASSIGN()
#undef ASSIGN
-
-
-
-
- /// specialisation of the \ref ExprConstructor struct when the node type is
- /// const TensorAssignOp
- #define CONVERSIONEXPRCONST(CVQual)\
- template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\
- struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\
- typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\
- typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\
- my_nested_type nestedExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\
- };
-
- CONVERSIONEXPRCONST(const)
- CONVERSIONEXPRCONST()
- #undef CONVERSIONEXPRCONST
-
/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorEvalToOp /// 0 here is the output number in the buffer
+/// TensorEvalToOp
#define EVALTO(CVQual)\
template <typename OrigExpr, typename Expr, typename... Params>\
struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\
@@ -234,14 +185,14 @@ EVALTO()
/// TensorForcedEvalOp
#define FORCEDEVAL(CVQual)\
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
-struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
+struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
- typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
- TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
+ typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
+ TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, 0, typename TensorForcedEvalOp<DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- : expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
+ : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
};
FORCEDEVAL(const)
@@ -262,130 +213,17 @@ struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPoi
CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0, 1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
- NumIndices, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, Eigen::internal::traits<TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
+ NumIndices, 0, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
+ : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
};
SYCLREDUCTIONEXPR(const)
SYCLREDUCTIONEXPR()
#undef SYCLREDUCTIONEXPR
-
-/// specialisation of the \ref ExprConstructor struct when the node type is
-/// TensorContractionOp
-#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
-template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
-struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
-CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
- static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
- typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
- NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\
- typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\
- Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
- :expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
-};
-
-SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTION
-
-
-
-#define SYCLSLICEOPEXPR(CVQual)\
-template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorSlicingOp <StartIndices, Sizes, OrigXprType> , CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.dimensions()) {}\
-};
-
-SYCLSLICEOPEXPR(const)
-SYCLSLICEOPEXPR()
-#undef SYCLSLICEOPEXPR
-
-
-#define SYCLSLICESTRIDEOPEXPR(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, OrigXprType>, CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.startIndices(), funcD.stopIndices(),funcD.strides()) {}\
-};
-
-SYCLSLICESTRIDEOPEXPR(const)
-SYCLSLICESTRIDEOPEXPR()
-#undef SYCLSLICESTRIDEOPEXPR
-
-#define SYCLRESHAPEANDSHUFFLEOPEXPRCONST(OPEXPR, CVQual)\
-template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param()) {}\
-};
-
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, const)
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorReshapingOp, )
-
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, const)
-SYCLRESHAPEANDSHUFFLEOPEXPRCONST(TensorShufflingOp, )
-#undef SYCLRESHAPEANDSHUFFLEOPEXPRCONST
-
-#define SYCLPADDINGOPEXPRCONST(OPEXPR, CVQual)\
-template<typename Param, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual OPEXPR <Param, OrigXprType> , CVQual OPEXPR <Param, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual OPEXPR <Param, typename my_xpr_type::Type> Type ;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.param() , funcD.scalar_param()) {}\
-};
-
-SYCLPADDINGOPEXPRCONST(TensorPaddingOp, const)
-SYCLPADDINGOPEXPRCONST(TensorPaddingOp, )
-#undef SYCLPADDINGOPEXPRCONST
-
-
-// TensorChippingOp
-#define SYCLTENSORCHIPPINGOPEXPR(CVQual)\
-template<DenseIndex DimId, typename OrigXprType, typename XprType, typename... Params>\
-struct ExprConstructor<CVQual TensorChippingOp <DimId, OrigXprType> , CVQual TensorChippingOp<DimId, XprType>, Params... >{\
- typedef ExprConstructor<OrigXprType, XprType, Params...> my_xpr_type;\
- typedef CVQual TensorChippingOp<DimId, typename my_xpr_type::Type> Type;\
- my_xpr_type xprExpr;\
- Type expr;\
- template <typename FuncDetector>\
- ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
- : xprExpr(funcD.xprExpr, t), expr(xprExpr.expr, funcD.offset(), funcD.dimId()) {}\
-};
-
-SYCLTENSORCHIPPINGOPEXPR(const)
-SYCLTENSORCHIPPINGOPEXPR()
-#undef SYCLTENSORCHIPPINGOPEXPR
-
-
/// template deduction for \ref ExprConstructor struct
template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t)
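
The TensorMap/ForcedEval/Reduction constructors above all follow the same placeholder protocol: on the host every leaf of the expression is numbered, and on the device the ExprConstructor rebuilds leaf N from slot N of the tuple of accessors. The sketch below shows only that indexing idea, with std::tuple standing in for utility::tuple and made-up types (PlaceHolder, DeviceLeaf, Build); it is not Eigen's implementation.

#include <cstddef>
#include <iostream>
#include <tuple>

template <typename Expr, std::size_t N> struct PlaceHolder {};

struct DeviceLeaf { const float* ptr; };

// one specialisation per (device node, placeholder) pair
template <typename DevNode, typename PlaceHolderNode> struct Build;

template <typename Expr, std::size_t N>
struct Build<DeviceLeaf, PlaceHolder<Expr, N> > {
  template <typename Tuple>
  static DeviceLeaf run(const Tuple& accessors) {
    return DeviceLeaf{std::get<N>(accessors)};   // leaf N is rebuilt from slot N
  }
};

int main() {
  float a[3] = {1.f, 2.f, 3.f}, b[3] = {4.f, 5.f, 6.f};
  auto accessors = std::make_tuple(a, b);        // would be SYCL accessors on device
  struct HostExprA {};                           // tag types for two host leaves
  struct HostExprB {};
  DeviceLeaf la = Build<DeviceLeaf, PlaceHolder<HostExprA, 0> >::run(accessors);
  DeviceLeaf lb = Build<DeviceLeaf, PlaceHolder<HostExprB, 1> >::run(accessors);
  std::cout << la.ptr[0] + lb.ptr[2] << "\n";    // 1 + 6 = 7
}
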
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
index b512d43..b1da685 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -35,8 +35,6 @@
namespace Eigen {
namespace TensorSycl {
namespace internal {
-#define RETURN_CPP11(expr) ->decltype(expr) {return expr;}
-
/// \struct ExtractAccessor: Extract Accessor Class is used to extract the
/// accessor from a buffer.
/// Depending on the type of the leaf node we can get a read accessor or a
@@ -45,192 +43,159 @@ template <typename Evaluator>
struct ExtractAccessor;
struct AccessorConstructor{
- template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval)
- RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval))
-
- template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2)
- RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2)))
-
- template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3)
- RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))))
-
- template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval)
- RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data())))
+ template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, Arg eval)
+ -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) {
+ return ExtractAccessor<Arg>::getTuple(cgh, eval);
+ }
+
+ template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1, Arg2 eval2)
+ -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) {
+ return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2));
+ }
+ template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1 , Arg2 eval2 , Arg3 eval3)
+ -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) {
+ return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
+ }
+ template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval)
+ -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM,
+ typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){
+ return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM, typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()));
+ }
};
/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp
-#define SYCLUNARYCATEGORYEXTACC(CVQual)\
-template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\
-RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
+/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp and const TensorBroadcastingOp
+template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){
+ return AccessorConstructor::getTuple(cgh, eval.impl());
+ }
};
-SYCLUNARYCATEGORYEXTACC(const)
-SYCLUNARYCATEGORYEXTACC()
-#undef SYCLUNARYCATEGORYEXTACC
-
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
-#define SYCLBINARYCATEGORYEXTACC(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp
+template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+ return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+ }
};
-
-SYCLBINARYCATEGORYEXTACC(const)
-SYCLBINARYCATEGORYEXTACC()
-#undef SYCLBINARYCATEGORYEXTACC
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// const TensorCwiseTernaryOp
-#define SYCLTERNARYCATEGORYEXTACC(CVQual)\
-template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\
+template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){
+ return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());
+ }
};
-SYCLTERNARYCATEGORYEXTACC(const)
-SYCLTERNARYCATEGORYEXTACC()
-#undef SYCLTERNARYCATEGORYEXTACC
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseTernaryOp
+template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorCwiseSelectOp. This is a special case where there is no OP
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){
+ return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());
+ }
+};
/// specialisation of the \ref ExtractAccessor struct when the node type is
/// TensorCwiseSelectOp. This is a special case where there is no OP
-#define SYCLSELECTOPEXTACC(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorAssignOp
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> eval)
+ -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+ return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+ }
};
-SYCLSELECTOPEXTACC(const)
-SYCLSELECTOPEXTACC()
-#undef SYCLSELECTOPEXTACC
-
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
-#define SYCLTENSORASSIGNOPEXTACC(CVQual)\
-template <typename LHSExpr, typename RHSExpr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
-};
-
- SYCLTENSORASSIGNOPEXTACC(const)
- SYCLTENSORASSIGNOPEXTACC()
- #undef SYCLTENSORASSIGNOPEXTACC
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorMap
#define TENSORMAPEXPR(CVQual, ACCType)\
template <typename PlainObjectType, int Options_, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\
+ static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> eval)\
+ -> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
+ return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
+ }\
};
-
TENSORMAPEXPR(const, cl::sycl::access::mode::read)
TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
#undef TENSORMAPEXPR
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
-#define SYCLFORCEDEVALEXTACC(CVQual)\
-template <typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorForcedEvalOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> eval)
+ -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
+ return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
+ }
};
-SYCLFORCEDEVALEXTACC(const)
-SYCLFORCEDEVALEXTACC()
-#undef SYCLFORCEDEVALEXTACC
-
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorForcedEvalOp<Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorEvalToOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<const TensorEvalToOp<Expr>, Dev> eval)
+ -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){
+ return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));
+ }
+};
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
-#define SYCLEVALTOEXTACC(CVQual)\
-template <typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\
- RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorEvalToOp<Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> >{};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorReductionOp
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> > {
+ static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> eval)
+ -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
+ return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
+ }
};
-SYCLEVALTOEXTACC(const)
-SYCLEVALTOEXTACC()
-#undef SYCLEVALTOEXTACC
-
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
-#define SYCLREDUCTIONEXTACC(CVQual)\
-template <typename OP, typename Dim, typename Expr, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-SYCLREDUCTIONEXTACC(const)
-SYCLREDUCTIONEXTACC()
-#undef SYCLREDUCTIONEXTACC
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
-#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
- struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
-};
-
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTIONEXTACC
-
-
-/// specialisation of the \ref ExtractAccessor struct when the node type is
-/// const TensorSlicingOp.
-#define SYCLSLICEOPEXTACC(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\
- RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICEOPEXTACC(const)
-SYCLSLICEOPEXTACC()
-#undef SYCLSLICEOPEXTACC
-// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorStridingSlicingOp.
-#define SYCLSLICESTRIDEOPEXTACC(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLSLICESTRIDEOPEXTACC(const)
-SYCLSLICESTRIDEOPEXTACC()
-#undef SYCLSLICESTRIDEOPEXTACC
-
-// specialisation of the \ref ExtractAccessor struct when the node type is
-/// TensorChippingOp.
-#define SYCLTENSORCHIPPINGOPEXTACC(CVQual)\
-template<DenseIndex DimId, typename XprType, typename Dev>\
-struct ExtractAccessor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev> >{\
- static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Dev>& eval)\
- RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
-};
-
-SYCLTENSORCHIPPINGOPEXTACC(const)
-SYCLTENSORCHIPPINGOPEXTACC()
-#undef SYCLTENSORCHIPPINGOPEXTACC
-
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorReductionOp<OP, Dim, Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> >{};
/// template deduction for \ref ExtractAccessor
template <typename Evaluator>
-auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& eval)
--> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, eval)) {
- return ExtractAccessor<Evaluator>::getTuple(cgh, eval);
+auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& expr)
+-> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, expr)) {
+ return ExtractAccessor<Evaluator>::getTuple(cgh, expr);
}
} /// namespace TensorSycl
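
A pattern worth calling out in the rewritten ExtractAccessor (and in the FunctorExtractor changes that follow): the full specialisation is written once for the const expression type, and the non-const specialisation simply derives from it. A minimal standalone version of that trick, with illustrative names Wrapper/Extract rather than Eigen types:

#include <iostream>

template <typename T> struct Wrapper { T value; };

template <typename Expr> struct Extract;              // primary, left undefined

template <typename T>
struct Extract<const Wrapper<T> > {                   // all the logic lives here
  static T get(const Wrapper<T>& w) { return w.value; }
};

template <typename T>
struct Extract<Wrapper<T> > : Extract<const Wrapper<T> > {};  // reuse, no duplication

int main() {
  Wrapper<int> w{42};
  std::cout << Extract<Wrapper<int> >::get(w) << "\n";        // 42, via the base
  std::cout << Extract<const Wrapper<int> >::get(w) << "\n";  // 42
}
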
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index ee02018..4271253 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -36,277 +36,135 @@ namespace internal {
template <typename Evaluator> struct FunctorExtractor{
typedef typename Evaluator::Dimensions Dimensions;
const Dimensions m_dimensions;
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+ const Dimensions& dimensions() const { return m_dimensions; }
FunctorExtractor(const Evaluator& expr)
: m_dimensions(expr.dimensions()) {}
};
-/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
-///TensorConversionOp
-#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
-template <typename ArgType1, typename ArgType2, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\
- FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\
- : subExpr(expr.impl()) {}\
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp, and const TensorBroadcastingOp
+template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ OP func;
+ FunctorExtractor(const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev>& expr)
+ : rhsExpr(expr.impl()), func(expr.functor()) {}
};
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
+template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> >{};
-SYCLEXTRFUNCCONVERSION(TensorConversionOp, const)
-SYCLEXTRFUNCCONVERSION(TensorConversionOp, )
-#undef SYCLEXTRFUNCCONVERSION
-
-#define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\
-template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\
-struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\
-FunctorExtractor(const TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev>& ){}\
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ OP func;
+ FunctorExtractor(const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)
+ : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}
};
-SYCLEXTRTENSORMAPFIXEDSIZE(const)
-SYCLEXTRTENSORMAPFIXEDSIZE()
-#undef SYCLEXTRTENSORMAPFIXEDSIZE
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseBinaryOp
+template <template <class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
-#define SYCLEXTRFUNCUNARY(CVQual)\
-template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& expr)\
- : rhsExpr(expr.impl()), func(expr.functor()) {}\
+/// const TensorCwiseTernaryOp
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;
+ FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;
+ FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;
+ OP func;
+ FunctorExtractor(const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)
+ : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}
};
-SYCLEXTRFUNCUNARY(const)
-SYCLEXTRFUNCUNARY()
-#undef SYCLEXTRFUNCUNARY
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseTernaryOp
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct FunctorExtractor<TensorEvaluator< TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
+:FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseBinaryOp
-#define SYCLEXTRFUNCBIINARY(CVQual)\
-template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}\
+/// const TensorCwiseSelectOp. This is a specialisation without OP so it has to be separated.
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;
+ FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;
+ FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;
+ FunctorExtractor(const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)
+ : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}
};
-SYCLEXTRFUNCBIINARY(const)
-SYCLEXTRFUNCBIINARY()
-#undef SYCLEXTRFUNCBIINARY
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseSelectOp. This is a specialisation without OP so it has to be separated.
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
+:FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {};
-/// specialisation of the \ref FunctorExtractor struct when the node type is TensorCwiseTernaryOp
-#define SYCLEXTRFUNCTERNARY(CVQual)\
-template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;\
- FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;\
- FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;\
- const OP func;\
- FunctorExtractor(const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)\
- : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}\
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorAssignOp. This is a specialisation without OP so it has to be separated.
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ FunctorExtractor(const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)
+ : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}
};
-SYCLEXTRFUNCTERNARY(const)
-SYCLEXTRFUNCTERNARY()
-#undef SYCLEXTRFUNCTERNARY
-
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorCwiseSelectOp. This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCSELECTOP(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
-struct FunctorExtractor< TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;\
- FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;\
- FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)\
- : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}\
-};
+/// TensorAssignOp. This is a specialisation without OP so it has to be separated.
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
+:FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
-SYCLEXTRFUNCSELECTOP(const)
-SYCLEXTRFUNCSELECTOP()
-#undef SYCLEXTRFUNCSELECTOP
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorAssignOp. This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCASSIGNOP(CVQual)\
-template <typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}\
+/// const TensorEvalToOp. This is a specialisation without OP so it has to be separated.
+template <typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {
+ FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+ FunctorExtractor(const TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev>& expr)
+ : rhsExpr(expr.impl()) {}
};
-SYCLEXTRFUNCASSIGNOP(const)
-SYCLEXTRFUNCASSIGNOP()
-#undef SYCLEXTRFUNCASSIGNOP
/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// TensorEvalToOp, This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCEVALTOOP(CVQual)\
-template <typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorEvalToOp<RHSExpr>, Dev>& expr)\
- : rhsExpr(expr.impl()) {}\
-};
-
-SYCLEXTRFUNCEVALTOOP(const)
-SYCLEXTRFUNCEVALTOOP()
-#undef SYCLEXTRFUNCEVALTOOP
+/// TensorEvalToOp. This is a specialisation without OP so it has to be separated.
+template <typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorEvalToOp<RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {};
template<typename Dim, size_t NumOutputDim> struct DimConstr {
template<typename InDim>
- static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return dims;}
+ static inline Dim getDim(InDim dims ) {return dims;}
};
template<typename Dim> struct DimConstr<Dim, 0> {
template<typename InDim>
- static EIGEN_STRONG_INLINE Dim getDim(InDim dims ) {return Dim(static_cast<Dim>(dims.TotalSize()));}
-};
-
-#define SYCLEXTRFUNCREDUCTIONOP(CVQual)\
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{\
- typedef TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;\
- typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;\
- const Dimensions m_dimensions;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- FunctorExtractor(const TensorEvaluator<CVQual TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)\
- : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}\
-};
-
-
-SYCLEXTRFUNCREDUCTIONOP(const)
-SYCLEXTRFUNCREDUCTIONOP()
-#undef SYCLEXTRFUNCREDUCTIONOP
-
-#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
- typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\
- typedef typename Evaluator::Dimensions Dimensions;\
- const Dimensions m_dimensions;\
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
- FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\
- : m_dimensions(expr.dimensions()) {}\
+ static inline Dim getDim(InDim dims ) {return Dim(dims.TotalSize());}
};
-
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
-SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
-#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is
-/// const TensorSlicingOp. This is an specialisation without OP so it has to be separated.
-#define SYCLEXTRFUNCTSLICEOP(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
- FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
- const StartIndices m_offsets;\
- const Sizes m_dimensions;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_offsets(expr.startIndices()), m_dimensions(expr.dimensions()) {}\
- EIGEN_STRONG_INLINE const StartIndices& startIndices() const {return m_offsets;}\
- EIGEN_STRONG_INLINE const Sizes& dimensions() const {return m_dimensions;}\
-};
-
-SYCLEXTRFUNCTSLICEOP(const)
-SYCLEXTRFUNCTSLICEOP()
-#undef SYCLEXTRFUNCTSLICEOP
-
-#define SYCLEXTRFUNCTSLICESTRIDEOP(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
- FunctorExtractor<TensorEvaluator<XprType, Dev> > xprExpr;\
- const StartIndices m_startIndices;\
- const StopIndices m_stopIndices;\
- const Strides m_strides;\
- FunctorExtractor(const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices,Strides, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_startIndices(expr.exprStartIndices()), m_stopIndices(expr.exprStopIndices()), m_strides(expr.strides()) {}\
- EIGEN_STRONG_INLINE const StartIndices& startIndices() const { return m_startIndices; }\
- EIGEN_STRONG_INLINE const StartIndices& stopIndices() const { return m_stopIndices; }\
- EIGEN_STRONG_INLINE const StartIndices& strides() const { return m_strides; }\
-};
-
-SYCLEXTRFUNCTSLICESTRIDEOP(const)
-SYCLEXTRFUNCTSLICESTRIDEOP()
-#undef SYCLEXTRFUNCTSLICESTRIDEOP
-
-// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory
-#define SYCLRESHAPEANDSHUFFLEOPFUNCEXT(OPEXPR, FUNCCALL, CVQual)\
-template<typename Param, typename XprType, typename Dev>\
-struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
- const Param m_param;\
- EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
- FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_param(expr.FUNCCALL) {}\
-};
-
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), const)
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorReshapingOp, dimensions(), )
-
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), const)
-SYCLRESHAPEANDSHUFFLEOPFUNCEXT(TensorShufflingOp, shufflePermutation(), )
-#undef SYCLRESHAPEANDSHUFFLEOPFUNCEXT
-
-// Had to separate reshapeOP otherwise it will be mistaken by UnaryCategory
-#define PADDINGOPFUNCEXT(OPEXPR, FUNCCALL, SCALARFUNCCALL, CVQual)\
-template<typename Param, typename XprType, typename Dev>\
-struct FunctorExtractor<Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev> > {\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Dev> > xprExpr;\
- const Param m_param;\
- typedef typename Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>::Scalar Scalar;\
- const Scalar m_scalar_param;\
- EIGEN_STRONG_INLINE const Param& param() const { return m_param; }\
- EIGEN_STRONG_INLINE const Scalar& scalar_param() const { return m_scalar_param; }\
- FunctorExtractor(const Eigen::TensorEvaluator<CVQual Eigen::OPEXPR<Param, XprType>, Dev>& expr)\
- : xprExpr(expr.impl()), m_param(expr.FUNCCALL), m_scalar_param(expr.SCALARFUNCCALL) {}\
-};
-
-PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), const)
-PADDINGOPFUNCEXT(TensorPaddingOp, padding(), padding_value(), )
-#undef PADDINGOPFUNCEXT
-
-/// specialisation of the \ref FunctorExtractor struct when the node type is TensorContractionOp and TensorConcatenationOp
-/// for TensorContractionOp the LHS and RHS here are the original one no need to apply condition on their type.
-#define SYCLEXTRFUNCCONTRACTCONCAT(OPEXPR, FUNCCALL, CVQual)\
-template <typename Param, typename LHSExpr, typename RHSExpr, typename Dev>\
-struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev> > {\
- FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;\
- FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;\
- const Param func;\
- FunctorExtractor(const TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>, Dev>& expr)\
- : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
-};
-
-// TensorConcatenationOp
-SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
-SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
-#undef SYCLEXTRFUNCCONTRACTCONCAT
-
-//TensorChippingOp
-#define SYCLEXTRFUNCCHIPPINGOP(CVQual)\
-template<DenseIndex DimId, typename XprType, typename Device>\
-struct FunctorExtractor<TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>>{\
- FunctorExtractor<Eigen::TensorEvaluator<XprType, Device> > xprExpr;\
- const DenseIndex m_dim;\
- const DenseIndex m_offset;\
- EIGEN_STRONG_INLINE const DenseIndex& dimId() const { return m_dim; }\
- EIGEN_STRONG_INLINE const DenseIndex& offset() const { return m_offset; }\
- FunctorExtractor(const TensorEvaluator<CVQual TensorChippingOp<DimId, XprType>, Device>& expr)\
- : xprExpr(expr.impl()), m_dim(expr.dimId()), m_offset(expr.offset()) {}\
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{
+ typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;
+ typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;
+ const Dimensions m_dimensions;
+ const Dimensions& dimensions() const { return m_dimensions; }
+ FunctorExtractor(const TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)
+ : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}
};
-SYCLEXTRFUNCCHIPPINGOP(const)
-SYCLEXTRFUNCCHIPPINGOP()
-#undef SYCLEXTRFUNCCHIPPINGOP
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>
+: FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{};
/// template deduction function for FunctorExtractor
template <typename Evaluator>
auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
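The hunk above replaces the SYCLEXTRFUNC* macro pairs with plain specialisations: the const-qualified evaluator specialisation holds the extracted state, and the non-const one simply inherits from it. A minimal, self-contained sketch of that idiom follows; the types (MyOp, Extractor) are made up for illustration and are not the Eigen ones, which are partial specialisations over TensorEvaluator.

#include <iostream>

template <typename Expr> struct Extractor;   // primary template, never defined

struct MyOp { int payload; };                // stand-in for an expression node

// The const-qualified specialisation does the actual extraction.
template <> struct Extractor<const MyOp> {
  int payload;
  Extractor(const MyOp& op) : payload(op.payload) {}
};

// The non-const specialisation forwards to the const one, which is what the
// patch does in place of the old SYCLEXTRFUNC*(const) / SYCLEXTRFUNC*() pairs.
template <> struct Extractor<MyOp> : Extractor<const MyOp> {
  using Extractor<const MyOp>::Extractor;
};

int main() {
  MyOp op{42};
  Extractor<MyOp> e(op);           // resolves to the non-const specialisation
  std::cout << e.payload << "\n";  // prints 42
  return 0;
}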
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
deleted file mode 100644
index 2f77790..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ /dev/null
@@ -1,245 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: eigen@codeplay.com
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// General include header of SYCL target for Tensor Module
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
- template<typename CoeffReturnType, typename OP, typename OutputAccessor, typename InputAccessor, typename LocalAccessor> struct GenericKernelReducer{
- OP op;
- OutputAccessor aOut;
- InputAccessor aI;
- LocalAccessor scratch;
- size_t length, local;
- GenericKernelReducer(OP op_, OutputAccessor aOut_, InputAccessor aI_, LocalAccessor scratch_, size_t length_, size_t local_)
- : op(op_), aOut(aOut_), aI(aI_), scratch(scratch_), length(length_), local(local_){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- size_t globalid = itemID.get_global(0);
- size_t localid = itemID.get_local(0);
- /* All threads collectively read from global memory into local.
- * The barrier ensures all threads' IO is resolved before
- * execution continues (strictly speaking, all threads within
- * a single work-group - there is no co-ordination between
- * work-groups, only work-items). */
- if (globalid < length) {
- scratch[localid] = aI[globalid];
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- /* Apply the reduction operation between the current local
- * id and the one on the other half of the vector. */
- if (globalid < length) {
- auto min = (length < local) ? length : local;
- for (size_t offset = min / 2; offset > 0; offset /= 2) {
- if (localid < offset) {
- auto accum = op.initialize();
- op.reduce(scratch[localid], &accum);
- op.reduce(scratch[localid + offset], &accum);
- op.finalize(accum);
- scratch[localid]=accum;
- //scratch[localid] += scratch[localid + offset];
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- }
- /* The final result will be stored in local id 0. */
- if (localid == 0) {
- aI[itemID.get_group(0)] = scratch[localid];
- if((length<=local) && globalid ==0){
- auto aOutPtr = ConvertToActualTypeSycl(CoeffReturnType, aOut);
- aOutPtr[0]=scratch[0];
- }
- }
- }
- }
-
- };
-
-/// ReductionFunctor
-template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Op, typename Index> class ReductionFunctor {
- public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
- ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_, Op functor_, Index range_, Index)
- :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(functor_), range(range_) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=static_cast<Index>(itemID.get_global_linear_id());
- if (globalid< range) {
- typename DeviceSelf::CoeffReturnType accum = functor.initialize();
- Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
- functor.finalize(accum);
- output_accessor_ptr[globalid]= accum;
- }
- }
- private:
- write_accessor output_accessor;
- FunctorExpr functors;
- Tuple_of_Acc tuple_of_accessors;
- Dims dims;
- Op functor;
- Index range;
-};
-
-template < typename HostExpr, typename FunctorExpr, typename Tuple_of_Acc, typename Dims, typename Index>
-class ReductionFunctor<HostExpr, FunctorExpr, Tuple_of_Acc, Dims, Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index> {
- public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
- typedef Eigen::internal::SumReducer<typename HostExpr::CoeffReturnType> Op;
- ReductionFunctor(write_accessor output_accessor_, FunctorExpr functors_, Tuple_of_Acc tuple_of_accessors_,Dims dims_,
- Eigen::internal::MeanReducer<typename HostExpr::CoeffReturnType>, Index range_, Index num_values_to_reduce_)
- :output_accessor(output_accessor_), functors(functors_), tuple_of_accessors(tuple_of_accessors_), dims(dims_), functor(Op()), range(range_), num_values_to_reduce(num_values_to_reduce_) {}
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- auto output_accessor_ptr =ConvertToActualTypeSycl(typename DeviceSelf::CoeffReturnType, output_accessor);
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=static_cast<Index>(itemID.get_global_linear_id());
- if (globalid< range) {
- typename DeviceSelf::CoeffReturnType accum = functor.initialize();
- Eigen::internal::GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(static_cast<typename DevExpr::Index>(globalid)),const_cast<Op&>(functor), &accum);
- functor.finalize(accum);
- output_accessor_ptr[globalid]= accum/num_values_to_reduce;
- }
- }
- private:
- write_accessor output_accessor;
- FunctorExpr functors;
- Tuple_of_Acc tuple_of_accessors;
- Dims dims;
- Op functor;
- Index range;
- Index num_values_to_reduce;
-};
-
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Op, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor{
-public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- OutAccessor tmp_global_accessor;
- Index rng , remaining, red_factor;
- Op op;
- Dims dims;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
-
- FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Op op_, Dims dims_, FunctorExpr functors_, TupleType t_acc)
- :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(op_), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=itemID.get_global_linear_id();
-
- tmp_global_accessor.get_pointer()[globalid]=(globalid<rng) ? Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op))
- : static_cast<CoeffReturnType>(op.initialize());
-
- if(remaining!=0 && globalid==0 ){
- // this will add the rest of input buffer when the input size is not devidable to red_factor.
- auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::
- reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
- auto accum = op.initialize();
- op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
- op.reduce(remaining_reduce, &accum);
- op.finalize(accum);
- tmp_global_accessor.get_pointer()[0]=accum;
-
- }
- }
-};
-
-template<typename CoeffReturnType ,typename OutAccessor, typename HostExpr, typename FunctorExpr, typename Dims, typename Index, typename TupleType>
-class FullReductionKernelFunctor<CoeffReturnType, OutAccessor, HostExpr, FunctorExpr, Eigen::internal::MeanReducer<CoeffReturnType>, Dims, Index, TupleType>{
-public:
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
-
- OutAccessor tmp_global_accessor;
- Index rng , remaining, red_factor;
- Op op;
- Dims dims;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
-
- FullReductionKernelFunctor(OutAccessor acc, Index rng_, Index remaining_, Index red_factor_, Eigen::internal::MeanReducer<CoeffReturnType>, Dims dims_, FunctorExpr functors_, TupleType t_acc)
- :tmp_global_accessor(acc), rng(rng_), remaining(remaining_), red_factor(red_factor_),op(Op()), dims(dims_), functors(functors_), tuple_of_accessors(t_acc){}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
-
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=itemID.get_global_linear_id();
- auto scale = (rng*red_factor) + remaining;
-
- tmp_global_accessor.get_pointer()[globalid]= (globalid<rng)? ((Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*globalid), red_factor, const_cast<Op&>(op)))/scale)
- :static_cast<CoeffReturnType>(op.initialize())/scale;
-
- if(remaining!=0 && globalid==0 ){
- // this will add the rest of input buffer when the input size is not devidable to red_factor.
- auto remaining_reduce =Eigen::internal::InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(op));
- auto accum = op.initialize();
- tmp_global_accessor.get_pointer()[0]= tmp_global_accessor.get_pointer()[0]*scale;
- op.reduce(tmp_global_accessor.get_pointer()[0], &accum);
- op.reduce(remaining_reduce, &accum);
- op.finalize(accum);
- tmp_global_accessor.get_pointer()[0]=accum/scale;
-
- }
- }
-};
-
-}
-}
-}
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCLFUNCTORS_H
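The deleted GenericKernelReducer above performs a classic work-group tree reduction: each work-item copies one value into local scratch memory, then the upper half of the buffer is repeatedly folded onto the lower half with a local barrier between steps. A host-side sketch of just that halving loop, in plain C++ without SYCL, assuming a power-of-two group size and a sum reducer:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> scratch = {1, 2, 3, 4, 5, 6, 7, 8};  // one work-group's local buffer
  // Fold the upper half onto the lower half, mirroring the kernel's
  // "for (offset = min / 2; offset > 0; offset /= 2)" loop.
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
    for (std::size_t localid = 0; localid < offset; ++localid) {
      scratch[localid] += scratch[localid + offset];      // op.reduce(...) for a sum reducer
    }
    // In the SYCL kernel a local barrier sits here so every work-item sees
    // the partial sums before the next halving step.
  }
  std::cout << "group result = " << scratch[0] << "\n";   // prints 36
  return 0;
}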
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
index a1c112f..25d1fac 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -44,120 +44,68 @@ struct CategoryCount<Arg,Args...>{
};
/// specialisation of the \ref LeafCount struct when the node type is const TensorMap
-#define SYCLTENSORMAPLEAFCOUNT(CVQual)\
-template <typename PlainObjectType, int Options_, template <class> class MakePointer_>\
-struct LeafCount<CVQual TensorMap<PlainObjectType, Options_, MakePointer_> > {\
- static const size_t Count =1;\
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> > {
+ static const size_t Count =1;
};
-SYCLTENSORMAPLEAFCOUNT(const)
-SYCLTENSORMAPLEAFCOUNT()
-#undef SYCLTENSORMAPLEAFCOUNT
+/// specialisation of the \ref LeafCount struct when the node type is TensorMap
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct LeafCount<TensorMap<PlainObjectType, Options_, MakePointer_> > :LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> >{};
-// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp
-#define SYCLCATEGORYLEAFCOUNT(CVQual)\
-template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>\
-struct LeafCount<CVQual CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
-
-SYCLCATEGORYLEAFCOUNT(const)
-SYCLCATEGORYLEAFCOUNT()
-#undef SYCLCATEGORYLEAFCOUNT
+// const TensorCwiseUnaryOp, const TensorCwiseNullaryOp, const TensorCwiseBinaryOp, const TensorCwiseTernaryOp, and const TensorBroadcastingOp
+template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
+struct LeafCount<const CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
+// TensorCwiseUnaryOp, TensorCwiseNullaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, and TensorBroadcastingOp
+template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
+struct LeafCount<CategoryExpr<OP, RHSExpr...> > :LeafCount<const CategoryExpr<OP, RHSExpr...> >{};
/// specialisation of the \ref LeafCount struct when the node type is const TensorSelectOp is an exception
-#define SYCLSELECTOPLEAFCOUNT(CVQual)\
-template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
-struct LeafCount<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
-
-SYCLSELECTOPLEAFCOUNT(const)
-SYCLSELECTOPLEAFCOUNT()
-#undef SYCLSELECTOPLEAFCOUNT
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>
+struct LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
+/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>
+struct LeafCount<TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >: LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > {};
-/// specialisation of the \ref LeafCount struct when the node type is TensorAssignOp
-#define SYCLLEAFCOUNTASSIGNOP(CVQual)\
-template <typename LHSExpr, typename RHSExpr>\
-struct LeafCount<CVQual TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
+/// specialisation of the \ref LeafCount struct when the node type is const TensorAssignOp
+template <typename LHSExpr, typename RHSExpr>
+struct LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
-SYCLLEAFCOUNTASSIGNOP(const)
-SYCLLEAFCOUNTASSIGNOP()
-#undef SYCLLEAFCOUNTASSIGNOP
+/// specialisation of the \ref LeafCount struct when the node type is
+/// TensorAssignOp. This is an exception: it is not the same as the unary case.
+template <typename LHSExpr, typename RHSExpr>
+struct LeafCount<TensorAssignOp<LHSExpr, RHSExpr> > :LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >{};
/// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp
-#define SYCLFORCEDEVALLEAFCOUNT(CVQual)\
-template <typename Expr>\
-struct LeafCount<CVQual TensorForcedEvalOp<Expr> > {\
- static const size_t Count =1;\
+template <typename Expr>
+struct LeafCount<const TensorForcedEvalOp<Expr> > {
+ static const size_t Count =1;
};
-SYCLFORCEDEVALLEAFCOUNT(const)
-SYCLFORCEDEVALLEAFCOUNT()
-#undef SYCLFORCEDEVALLEAFCOUNT
+/// specialisation of the \ref LeafCount struct when the node type is TensorForcedEvalOp
+template <typename Expr>
+struct LeafCount<TensorForcedEvalOp<Expr> >: LeafCount<const TensorForcedEvalOp<Expr> > {};
-/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
-#define EVALTOLEAFCOUNT(CVQual)\
-template <typename Expr>\
-struct LeafCount<CVQual TensorEvalToOp<Expr> > {\
- static const size_t Count = 1 + CategoryCount<Expr>::Count;\
+/// specialisation of the \ref LeafCount struct when the node type is const TensorEvalToOp
+template <typename Expr>
+struct LeafCount<const TensorEvalToOp<Expr> > {
+ static const size_t Count = 1 + CategoryCount<Expr>::Count;
};
-EVALTOLEAFCOUNT(const)
-EVALTOLEAFCOUNT()
-#undef EVALTOLEAFCOUNT
-
/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
-#define REDUCTIONLEAFCOUNT(CVQual)\
-template <typename OP, typename Dim, typename Expr>\
-struct LeafCount<CVQual TensorReductionOp<OP, Dim, Expr> > {\
- static const size_t Count =1;\
+template <typename OP, typename Dim, typename Expr>
+struct LeafCount<const TensorReductionOp<OP, Dim, Expr> > {
+ static const size_t Count =1;
};
-REDUCTIONLEAFCOUNT(const)
-REDUCTIONLEAFCOUNT()
-#undef REDUCTIONLEAFCOUNT
-
-/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp
-#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\
-template <typename Indices, typename LhsXprType, typename RhsXprType>\
-struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\
- static const size_t Count =1;\
-};
-
-CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
-CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
-#undef CONTRACTIONCONVOLUTIONLEAFCOUNT
-
-
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
-#define SLICEOPLEAFCOUNT(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType>\
-struct LeafCount<CVQual TensorSlicingOp<StartIndices, Sizes, XprType> >:CategoryCount<XprType>{};
-
-SLICEOPLEAFCOUNT(const)
-SLICEOPLEAFCOUNT()
-#undef SLICEOPLEAFCOUNT
-
-
-/// specialisation of the \ref LeafCount struct when the node type is TensorChippingOp
-#define CHIPPINGOPLEAFCOUNT(CVQual)\
-template <DenseIndex DimId, typename XprType>\
-struct LeafCount<CVQual TensorChippingOp<DimId, XprType> >:CategoryCount<XprType>{};
-
-CHIPPINGOPLEAFCOUNT(const)
-CHIPPINGOPLEAFCOUNT()
-#undef CHIPPINGOPLEAFCOUNT
-
-
-#define SLICESTRIDEOPLEAFCOUNT(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>\
-struct LeafCount<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >:CategoryCount<XprType>{};
-
-SLICESTRIDEOPLEAFCOUNT(const)
-SLICESTRIDEOPLEAFCOUNT()
-#undef SLICESTRIDEOPLEAFCOUNT
+/// specialisation of the \ref LeafCount struct when the node type is TensorReductionOp
+template <typename OP, typename Dim, typename Expr>
+struct LeafCount<TensorReductionOp<OP, Dim, Expr> >: LeafCount<const TensorReductionOp<OP, Dim, Expr> >{};
+/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
+template <typename Expr>
+struct LeafCount<TensorEvalToOp<Expr> >: LeafCount<const TensorEvalToOp<Expr> >{};
} /// namespace TensorSycl
} /// namespace internal
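The LeafCount / CategoryCount machinery above is a compile-time count of the leaf nodes (TensorMap, forced evaluations, reductions) in an expression tree, now written as const specialisations with the non-const forms inheriting from them. A standalone sketch of the same recursion follows; Leaf, Node and LeafCounter are illustrative names, not the Eigen types.

#include <cstddef>
#include <iostream>

struct Leaf {};                                   // stands in for a TensorMap
template <typename L, typename R> struct Node {}; // stands in for a binary op

// By default a type counts as a single leaf, like the TensorMap specialisation.
template <typename T> struct LeafCounter {
  static const std::size_t Count = 1;
};

// An interior node contributes the leaves of both children, which is what
// CategoryCount<LHSExpr, RHSExpr> does for TensorAssignOp and friends.
template <typename L, typename R> struct LeafCounter<Node<L, R> > {
  static const std::size_t Count = LeafCounter<L>::Count + LeafCounter<R>::Count;
};

// The const case simply forwards to the non-const one (the patch forwards in
// the other direction, but the idea is the same).
template <typename T> struct LeafCounter<const T> : LeafCounter<T> {};

int main() {
  // (leaf op leaf) assigned to a leaf: three leaves in total.
  std::cout << LeafCounter<const Node<Node<Leaf, Leaf>, Leaf> >::Count << "\n";
  return 0;
}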
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
index 74566dc..d4c250c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -122,9 +122,9 @@ ASSIGNEXPR()
/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorMap
#define TENSORMAPEXPR(CVQual)\
-template <typename T, int Options_, template <class> class MakePointer_, size_t N>\
-struct PlaceHolderExpression< CVQual TensorMap< T, Options_, MakePointer_>, N> {\
- typedef CVQual PlaceHolder<CVQual TensorMap<T, Options_, MakePointer_>, N> Type;\
+template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_, size_t N>\
+struct PlaceHolderExpression< CVQual TensorMap< Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> {\
+ typedef CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> Type;\
};
TENSORMAPEXPR(const)
@@ -157,18 +157,6 @@ EVALTO()
/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorChippingOp
-#define CHIPPINGOP(CVQual)\
-template <DenseIndex DimId, typename Expr, size_t N>\
-struct PlaceHolderExpression<CVQual TensorChippingOp<DimId, Expr>, N> {\
- typedef CVQual TensorChippingOp< DimId, typename CalculateIndex <N, Expr>::ArgType> Type;\
-};
-
-CHIPPINGOP(const)
-CHIPPINGOP()
-#undef CHIPPINGOP
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorReductionOp
#define SYCLREDUCTION(CVQual)\
template <typename OP, typename Dims, typename Expr, size_t N>\
@@ -179,45 +167,6 @@ SYCLREDUCTION(const)
SYCLREDUCTION()
#undef SYCLREDUCTION
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorReductionOp
-#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
-template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\
-struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\
- typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\
-};
-SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp)
-SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp)
-#undef SYCLCONTRACTIONCONVOLUTIONPLH
-
-
-/// specialisation of the \ref PlaceHolderExpression when the node is
-/// TensorCwiseSelectOp
-#define SLICEOPEXPR(CVQual)\
-template <typename StartIndices, typename Sizes, typename XprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, N> {\
- typedef CVQual TensorSlicingOp<StartIndices, Sizes, typename CalculateIndex<N, XprType>::ArgType> Type;\
-};
-
-SLICEOPEXPR(const)
-SLICEOPEXPR()
-#undef SLICEOPEXPR
-
-
-#define SYCLSLICESTRIDEOPPLH(CVQual)\
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, size_t N>\
-struct PlaceHolderExpression<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, N> {\
- typedef CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, typename CalculateIndex<N, XprType>::ArgType> Type;\
-};
-
-SYCLSLICESTRIDEOPPLH(const)
-SYCLSLICESTRIDEOPPLH()
-#undef SYCLSLICESTRIDEOPPLH
-
-
/// template deduction for \ref PlaceHolderExpression struct
template <typename Expr>
struct createPlaceHolderExpression {
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
index cac7855..7914b6f 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -25,70 +25,43 @@
namespace Eigen {
namespace TensorSycl {
-
-template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecExprFunctorKernel{
- typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
-
- typedef typename Expr::Index Index;
- FunctorExpr functors;
- TupleType tuple_of_accessors;
- Index range;
- ExecExprFunctorKernel(Index range_, FunctorExpr functors_, TupleType tuple_of_accessors_)
- : functors(functors_), tuple_of_accessors(tuple_of_accessors_), range(range_){}
- void operator()(cl::sycl::nd_item<1> itemID) {
- typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
- auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
- typename DevExpr::Index gId = static_cast<typename DevExpr::Index>(itemID.get_global_linear_id());
- if (gId < range)
- device_evaluator.evalScalar(gId);
- }
-};
-
/// The run function in tensor sycl convert the expression tree to a buffer
/// based expression tree;
/// creates the expression tree for the device with accessor to buffers;
/// construct the kernel and submit it to the sycl queue.
-/// std::array does not have TotalSize. So I have to get the size through template specialisation.
-template<typename , typename Dimensions> struct DimensionSize{
- static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){
- return dim.TotalSize();
- }
-};
-#define DIMSIZEMACRO(CVQual)\
-template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\
- static inline Index getDimSize(const std::array<Index, NumDims>& dim){\
- return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\
- }\
-};
-
-DIMSIZEMACRO(const)
-DIMSIZEMACRO()
-#undef DIMSIZEMACRO
-
-
template <typename Expr, typename Dev>
void run(Expr &expr, Dev &dev) {
Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
- typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr;
- FunctorExpr functors = internal::extractFunctors(evaluator);
- dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
- // create a tuple of accessors from Evaluator
- typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType;
- TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator);
- typename Expr::Index range, GRange, tileSize;
- typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions()));
- dev.parallel_for_setup(total_size, tileSize, range, GRange);
+ typedef typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
+ auto functors = internal::extractFunctors(evaluator);
- cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
- ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range
- , functors, tuple_of_accessors
- ));
+ size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+ dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+
+ // create a tuple of accessors from Evaluator
+ auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
+ const auto range = utility::tuple::get<0>(tuple_of_accessors).get_range()[0];
+ size_t GRange=range;
+ if (tileSize>GRange) tileSize=GRange;
+ else if(GRange>tileSize){
+ size_t xMode = GRange % tileSize;
+ if (xMode != 0) GRange += (tileSize - xMode);
+ }
+ // run the kernel
+ cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+ typedef typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
+ auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+ auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+ if (itemID.get_global_linear_id() < range) {
+ device_evaluator.evalScalar(static_cast<int>(itemID.get_global_linear_id()));
+ }
+ });
});
- dev.asynchronousExec();
+ dev.m_queue.throw_asynchronous();
}
+
evaluator.cleanup();
}
} // namespace TensorSycl
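The rewritten run() above sizes the kernel launch by rounding the flattened coefficient range up to a multiple of the chosen tile (work-group) size; the padding work-items are filtered out by the `if (itemID.get_global_linear_id() < range)` guard inside the kernel. A small worked example of that rounding, with illustrative numbers:

#include <cstddef>
#include <iostream>

int main() {
  std::size_t range = 1000;    // number of coefficients to evaluate (example value)
  std::size_t tileSize = 128;  // e.g. max_work_group_size / 2 (example value)

  std::size_t GRange = range;
  if (tileSize > GRange) {
    tileSize = GRange;         // shrink the tile for very small problems
  } else {
    std::size_t xMode = GRange % tileSize;
    if (xMode != 0) GRange += tileSize - xMode;  // round up to a multiple of tileSize
  }
  std::cout << "tileSize=" << tileSize << " GRange=" << GRange << "\n";
  // prints tileSize=128 GRange=1024; items 1000..1023 fail the range check and do nothing.
  return 0;
}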
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
index 58ab0f0..063b027 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
@@ -20,7 +20,6 @@
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
-
namespace utility {
namespace tuple {
/// \struct StaticIf
@@ -232,5 +231,4 @@ Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
}
} // tuple
} // utility
-
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index a1e944e..ffcf8b0 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -58,8 +58,6 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
};
template <typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
-
};
};
@@ -78,8 +76,6 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
};
template <typename T> struct MakePointer {
typedef T* Type;
- typedef T& RefType;
-
};
};
@@ -102,8 +98,6 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
- typedef typename MakePointerT::RefType RefType;
-
};
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index d23f2e4..3523e7c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -23,7 +23,6 @@ struct static_val {
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
- EIGEN_UNUSED_VARIABLE(v);
eigen_assert(v == n);
}
};
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 9dcc9da..354bce5 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -20,13 +20,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
typedef RunQueue<Task, 1024> Queue;
NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
- : NonBlockingThreadPoolTempl(num_threads, true, env) {}
-
- NonBlockingThreadPoolTempl(int num_threads, bool allow_spinning,
- Environment env = Environment())
- : num_threads_(num_threads),
- allow_spinning_(allow_spinning),
- env_(env),
+ : env_(env),
threads_(num_threads),
queues_(num_threads),
coprimes_(num_threads),
@@ -34,20 +28,19 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
blocked_(0),
spinning_(0),
done_(false),
- cancelled_(false),
ec_(waiters_) {
- waiters_.resize(num_threads_);
+ waiters_.resize(num_threads);
- // Calculate coprimes of num_threads_.
+ // Calculate coprimes of num_threads.
// Coprimes are used for a random walk over all threads in Steal
// and NonEmptyQueueIndex. Iteration is based on the fact that if we take
// a walk starting thread index t and calculate num_threads - 1 subsequent
// indices as (t + coprime) % num_threads, we will cover all threads without
// repetitions (effectively getting a presudo-random permutation of thread
// indices).
- for (int i = 1; i <= num_threads_; i++) {
+ for (int i = 1; i <= num_threads; i++) {
unsigned a = i;
- unsigned b = num_threads_;
+ unsigned b = num_threads;
// If GCD(a, b) == 1, then a and b are coprimes.
while (b != 0) {
unsigned tmp = a;
@@ -58,33 +51,24 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
coprimes_.push_back(i);
}
}
- for (int i = 0; i < num_threads_; i++) {
+ for (int i = 0; i < num_threads; i++) {
queues_.push_back(new Queue());
}
- for (int i = 0; i < num_threads_; i++) {
+ for (int i = 0; i < num_threads; i++) {
threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
}
}
~NonBlockingThreadPoolTempl() {
done_ = true;
-
// Now if all threads block without work, they will start exiting.
// But note that threads can continue to work arbitrary long,
// block, submit new work, unblock and otherwise live full life.
- if (!cancelled_) {
- ec_.Notify(true);
- } else {
- // Since we were cancelled, there might be entries in the queues.
- // Empty them to prevent their destructor from asserting.
- for (size_t i = 0; i < queues_.size(); i++) {
- queues_[i]->Flush();
- }
- }
+ ec_.Notify(true);
// Join threads explicitly to avoid destruction order issues.
- for (size_t i = 0; i < num_threads_; i++) delete threads_[i];
- for (size_t i = 0; i < num_threads_; i++) delete queues_[i];
+ for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
+ for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
}
void Schedule(std::function<void()> fn) {
@@ -107,31 +91,14 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
// completes overall computations, which in turn leads to destruction of
// this. We expect that such scenario is prevented by program, that is,
// this is kept alive while any threads can potentially be in Schedule.
- if (!t.f) {
+ if (!t.f)
ec_.Notify(false);
- }
- else {
+ else
env_.ExecuteTask(t); // Push failed, execute directly.
- }
- }
-
- void Cancel() {
- cancelled_ = true;
- done_ = true;
-
- // Let each thread know it's been cancelled.
-#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
- for (size_t i = 0; i < threads_.size(); i++) {
- threads_[i]->OnCancel();
- }
-#endif
-
- // Wake up the threads without work to let them exit on their own.
- ec_.Notify(true);
}
int NumThreads() const final {
- return num_threads_;
+ return static_cast<int>(threads_.size());
}
int CurrentThreadId() const final {
@@ -155,8 +122,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
};
Environment env_;
- const int num_threads_;
- const bool allow_spinning_;
MaxSizeVector<Thread*> threads_;
MaxSizeVector<Queue*> queues_;
MaxSizeVector<unsigned> coprimes_;
@@ -164,7 +129,6 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
std::atomic<unsigned> blocked_;
std::atomic<bool> spinning_;
std::atomic<bool> done_;
- std::atomic<bool> cancelled_;
EventCount ec_;
// Main worker thread loop.
@@ -175,62 +139,32 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
pt->thread_id = thread_id;
Queue* q = queues_[thread_id];
EventCount::Waiter* waiter = &waiters_[thread_id];
- // TODO(dvyukov,rmlarsen): The time spent in Steal() is proportional
- // to num_threads_ and we assume that new work is scheduled at a
- // constant rate, so we set spin_count to 5000 / num_threads_. The
- // constant was picked based on a fair dice roll, tune it.
- const int spin_count =
- allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
- if (num_threads_ == 1) {
- // For num_threads_ == 1 there is no point in going through the expensive
- // steal loop. Moreover, since Steal() calls PopBack() on the victim
- // queues it might reverse the order in which ops are executed compared to
- // the order in which they are scheduled, which tends to be
- // counter-productive for the types of I/O workloads the single thread
- // pools tend to be used for.
- while (!cancelled_) {
- Task t = q->PopFront();
- for (int i = 0; i < spin_count && !t.f; i++) {
- if (!cancelled_.load(std::memory_order_relaxed)) {
- t = q->PopFront();
- }
- }
+ for (;;) {
+ Task t = q->PopFront();
+ if (!t.f) {
+ t = Steal();
if (!t.f) {
- if (!WaitForWork(waiter, &t)) {
- return;
+ // Leave one thread spinning. This reduces latency.
+        // TODO(dvyukov): 1000 iterations is based on a fair dice roll; tune it.
+        // Also, the time it takes to attempt to steal work 1000 times depends
+        // on the size of the thread pool. However, the speed at which the user
+        // of the thread pool submits tasks is independent of the size of the
+        // pool. Consider a time-based limit instead.
+ if (!spinning_ && !spinning_.exchange(true)) {
+ for (int i = 0; i < 1000 && !t.f; i++) {
+ t = Steal();
+ }
+ spinning_ = false;
}
- }
- if (t.f) {
- env_.ExecuteTask(t);
- }
- }
- } else {
- while (!cancelled_) {
- Task t = q->PopFront();
- if (!t.f) {
- t = Steal();
if (!t.f) {
- // Leave one thread spinning. This reduces latency.
- if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
- for (int i = 0; i < spin_count && !t.f; i++) {
- if (!cancelled_.load(std::memory_order_relaxed)) {
- t = Steal();
- } else {
- return;
- }
- }
- spinning_ = false;
- }
- if (!t.f) {
- if (!WaitForWork(waiter, &t)) {
- return;
- }
+ if (!WaitForWork(waiter, &t)) {
+ return;
}
}
}
- if (t.f) {
- env_.ExecuteTask(t);
- }
+ }
+ if (t.f) {
+ env_.ExecuteTask(t);
}
}
}
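In the loop above at most one idle worker keeps spinning on Steal() before falling back to WaitForWork(), which keeps scheduling latency low while burning at most one core. The exclusivity is claimed with a cheap load followed by an exchange on a single atomic flag; a reduced standalone sketch of that claim/release pattern (function names are illustrative):

  #include <atomic>

  std::atomic<bool> spinning(false);   // true while some worker is busy-polling

  // The initial load short-circuits the read-modify-write when another worker is
  // already spinning; exchange(true) returning false means this caller won the slot.
  bool try_become_spinner() {
    return !spinning.load(std::memory_order_relaxed) && !spinning.exchange(true);
  }

  void stop_spinning() { spinning.store(false); }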
@@ -267,18 +201,14 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
int victim = NonEmptyQueueIndex();
if (victim != -1) {
ec_.CancelWait(waiter);
- if (cancelled_) {
- return false;
- } else {
- *t = queues_[victim]->PopBack();
- return true;
- }
+ *t = queues_[victim]->PopBack();
+ return true;
}
    // The number of blocked threads is used as the termination condition.
    // If we are shutting down and all worker threads are blocked without work,
    // that means we are done.
blocked_++;
- if (done_ && blocked_ == num_threads_) {
+ if (done_ && blocked_ == threads_.size()) {
ec_.CancelWait(waiter);
// Almost done, but need to re-check queues.
// Consider that all queues are empty and all worker threads are preempted
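WaitForWork() uses the count of blocked workers as its shutdown test, and the re-check of the queues guards against the race the comment above describes: a closure can still be pushed after the other workers decided to block. A hedged standalone sketch of that decision only (illustrative names, not the pool's exact code):

  #include <cstddef>

  // Termination test only: the worker has already incremented the shared
  // 'blocked' counter. It may exit only if the pool is shutting down, every
  // worker is blocked, and a final re-scan of the queues still finds them
  // empty; otherwise it must go back and help drain the remaining work.
  bool may_exit(bool done, unsigned blocked, std::size_t pool_size,
                bool queues_empty_on_recheck) {
    return done && blocked == pool_size && queues_empty_on_recheck;
  }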
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 49d0cdc..05ed76c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -177,13 +177,6 @@ class RunQueue {
// Can be called by any thread at any time.
bool Empty() const { return Size() == 0; }
- // Delete all the elements from the queue.
- void Flush() {
- while (!Empty()) {
- PopFront();
- }
- }
-
private:
static const unsigned kMask = kSize - 1;
static const unsigned kMask2 = (kSize << 1) - 1;
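For reference, the deleted Flush() had no hidden behaviour: it popped from the front until Empty() reported true. A generic drain helper of the same shape (DrainQueue is a hypothetical name; Empty() and PopFront() mirror the RunQueue interface above):

  // Sketch of the removed Flush(): discard pending work items one by one.
  template <typename Queue>
  void DrainQueue(Queue& q) {
    while (!q.Empty()) {
      (void)q.PopFront();   // PopFront returns the element; we simply drop it
    }
  }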
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
index 3357286..e75d0f4 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -69,14 +69,6 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface {
}
}
- void Cancel() {
-#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
- for (size_t i = 0; i < threads_.size(); i++) {
- threads_[i]->OnCancel();
- }
-#endif
- }
-
int NumThreads() const final {
return static_cast<int>(threads_.size());
}
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
deleted file mode 100644
index a05685f..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
+++ /dev/null
@@ -1,23 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
-#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
-
-// Try to come up with a portable way to cancel a thread
-#if EIGEN_OS_GNULINUX
- #define EIGEN_THREAD_CANCEL(t) \
- pthread_cancel(t.native_handle());
- #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
-#else
-#define EIGEN_THREAD_CANCEL(t)
-#endif
-
-
-#endif // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
index d94a064..399f95c 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -23,8 +23,6 @@ struct StlThreadEnvironment {
public:
EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
~EnvThread() { thr_.join(); }
- // This function is called when the threadpool is cancelled.
- void OnCancel() { }
private:
std::thread thr_;
diff --git a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
index 84e1e6c..a65ee97 100644
--- a/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ b/eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -16,14 +16,8 @@ namespace Eigen {
// custom thread pools underneath.
class ThreadPoolInterface {
public:
- // Submits a closure to be run by a thread in the pool.
virtual void Schedule(std::function<void()> fn) = 0;
- // If implemented, stop processing the closures that have been enqueued.
- // Currently running closures may still be processed.
- // If not implemented, does nothing.
- virtual void Cancel() {}
-
// Returns the number of threads in the pool.
virtual int NumThreads() const = 0;
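Because the interface stays this small, Tensor code can be pointed at custom pools. A hypothetical standalone stand-in with the three members referenced in this patch, executing every closure inline (a real implementation would derive from Eigen::ThreadPoolInterface and override its pure virtuals):

  #include <functional>

  // Illustrative stand-in only; member names mirror the interface shown above.
  class InlineThreadPool {
   public:
    void Schedule(std::function<void()> fn) { fn(); }  // run the closure immediately
    int NumThreads() const { return 1; }
    int CurrentThreadId() const { return 0; }          // always "thread 0"
  };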
diff --git a/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index 49d315a..ec27edd 100644
--- a/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -40,7 +40,7 @@ template<typename T, T... nn>
struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
template<typename T, T n, T... nn>
-struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; };
+struct numeric_list<T, n, nn...> { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; };
/* numeric list constructors
*
@@ -123,10 +123,6 @@ template<typename a, typename... as> struct get<0, type_lis
template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {};
template<typename T, T a, T... as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; };
-template<std::size_t n, typename T, T a, T... as> constexpr T array_get(const numeric_list<T, a, as...>&) {
- return get<(int)n, numeric_list<T, a, as...>>::value;
-}
-
/* always get type, regardless of dummy; good for parameter pack expansion */
template<typename T, T dummy, typename t> struct id_numeric { typedef t type; };
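The switch from static const to constexpr static keeps count and first_value usable in constant expressions without out-of-class definitions, and values remain reachable through get<> now that the numeric_list overload of array_get is gone. A standalone sketch of the member part of that pattern, re-declared outside Eigen's namespace purely for illustration:

  #include <cstddef>

  template<typename T, T... nn>
  struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };

  template<typename T, T n, T... nn>
  struct numeric_list<T, n, nn...> {
    constexpr static std::size_t count = sizeof...(nn) + 1;
    constexpr static T first_value = n;
  };

  using dims = numeric_list<int, 4, 2, 7>;
  static_assert(dims::count == 3, "three entries");
  static_assert(dims::first_value == 4, "head of the list");
  int buffer[dims::count];   // usable as an array bound, i.e. in a constant expression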
diff --git a/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 573ca43..30d3ebc 100644
--- a/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -169,7 +169,6 @@ template <typename T> class array<T, 0> {
#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
- EIGEN_UNUSED_VARIABLE(l);
eigen_assert(l.size() == 0);
}
#endif
@@ -201,15 +200,19 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
return a[I];
}
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<array<T,N> > {
static const size_t value = N;
};
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<array<T,N>& > {
static const size_t value = N;
};
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<const array<T,N> > {
static const size_t value = N;
};
+template <typename T> struct array_size;
template<class T, std::size_t N> struct array_size<const array<T,N>& > {
static const size_t value = N;
};
@@ -248,6 +251,14 @@ template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_
#undef STD_GET_ARR_HACK
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
+ static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<std::array<T,N> > {
+ static const size_t value = N;
+};
} // end namespace internal
} // end namespace Eigen
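With the forward declarations and the new std::array specializations above, generic code can query a compile-time length through one array_size trait regardless of which array flavour it received. A hedged standalone sketch of the same idea for std::array only, re-declared outside Eigen for illustration:

  #include <array>
  #include <cstddef>

  template <typename T> struct array_size;   // primary template: intentionally undefined

  template <class T, std::size_t N>
  struct array_size<std::array<T, N> > { static const std::size_t value = N; };

  template <class T, std::size_t N>
  struct array_size<const std::array<T, N> > { static const std::size_t value = N; };

  static_assert(array_size<std::array<float, 3> >::value == 3, "compile-time length");
  static_assert(array_size<const std::array<int, 5> >::value == 5, "const-qualified too");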