From f0238cfb6997c4acfc2bd200de7295f3fa36968f Mon Sep 17 00:00:00 2001 From: Stanislaw Halik Date: Sun, 3 Mar 2019 21:09:10 +0100 Subject: don't index Eigen --- eigen/unsupported/CMakeLists.txt | 9 - eigen/unsupported/Eigen/AdolcForward | 156 - eigen/unsupported/Eigen/AlignedVector3 | 224 -- eigen/unsupported/Eigen/ArpackSupport | 31 - eigen/unsupported/Eigen/AutoDiff | 40 - eigen/unsupported/Eigen/BVH | 95 - eigen/unsupported/Eigen/CMakeLists.txt | 32 - eigen/unsupported/Eigen/CXX11/CMakeLists.txt | 8 - eigen/unsupported/Eigen/CXX11/Tensor | 154 - eigen/unsupported/Eigen/CXX11/TensorSymmetry | 42 - eigen/unsupported/Eigen/CXX11/ThreadPool | 65 - eigen/unsupported/Eigen/CXX11/src/Tensor/README.md | 1760 ----------- eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h | 527 ---- .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 299 -- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 181 -- .../Eigen/CXX11/src/Tensor/TensorBase.h | 1012 ------- .../Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 392 --- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 384 --- .../Eigen/CXX11/src/Tensor/TensorConcatenation.h | 361 --- .../Eigen/CXX11/src/Tensor/TensorContraction.h | 628 ---- .../CXX11/src/Tensor/TensorContractionBlocking.h | 56 - .../Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 1391 --------- .../CXX11/src/Tensor/TensorContractionMapper.h | 467 --- .../CXX11/src/Tensor/TensorContractionThreadPool.h | 1043 ------- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 279 -- .../Eigen/CXX11/src/Tensor/TensorConvolution.h | 1104 ------- .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 212 -- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 313 -- .../Eigen/CXX11/src/Tensor/TensorDevice.h | 68 - .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 337 --- .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 81 - .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 122 - .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 282 -- .../Eigen/CXX11/src/Tensor/TensorDimensionList.h | 236 -- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 428 --- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 181 -- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 633 ---- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 288 -- .../Eigen/CXX11/src/Tensor/TensorExpr.h | 371 --- .../unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h | 651 ---- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 389 --- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 169 -- .../CXX11/src/Tensor/TensorForwardDeclarations.h | 109 - .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 489 --- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 185 -- .../Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h | 33 - .../unsupported/Eigen/CXX11/src/Tensor/TensorIO.h | 79 - .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 509 ---- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 725 ----- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 229 -- .../Eigen/CXX11/src/Tensor/TensorInitializer.h | 82 - .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 253 -- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 209 -- .../Eigen/CXX11/src/Tensor/TensorMacros.h | 54 - .../unsupported/Eigen/CXX11/src/Tensor/TensorMap.h | 323 -- .../Eigen/CXX11/src/Tensor/TensorMeta.h | 218 -- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 888 ------ .../Eigen/CXX11/src/Tensor/TensorPadding.h | 397 --- .../Eigen/CXX11/src/Tensor/TensorPatch.h | 269 -- .../Eigen/CXX11/src/Tensor/TensorRandom.h | 276 -- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 781 ----- .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 750 ----- .../Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 
242 -- .../unsupported/Eigen/CXX11/src/Tensor/TensorRef.h | 429 --- .../Eigen/CXX11/src/Tensor/TensorReverse.h | 288 -- .../Eigen/CXX11/src/Tensor/TensorScan.h | 287 -- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 264 -- .../Eigen/CXX11/src/Tensor/TensorStorage.h | 146 - .../Eigen/CXX11/src/Tensor/TensorStriding.h | 338 --- .../Eigen/CXX11/src/Tensor/TensorSycl.h | 82 - .../Tensor/TensorSyclConvertToDeviceExpression.h | 121 - .../CXX11/src/Tensor/TensorSyclExprConstructor.h | 239 -- .../CXX11/src/Tensor/TensorSyclExtractAccessor.h | 204 -- .../CXX11/src/Tensor/TensorSyclExtractFunctors.h | 177 -- .../Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h | 114 - .../CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h | 181 -- .../Eigen/CXX11/src/Tensor/TensorSyclRun.h | 70 - .../Eigen/CXX11/src/Tensor/TensorSyclTuple.h | 237 -- .../Eigen/CXX11/src/Tensor/TensorTraits.h | 272 -- .../Eigen/CXX11/src/Tensor/TensorUInt128.h | 248 -- .../Eigen/CXX11/src/Tensor/TensorVolumePatch.h | 608 ---- .../CXX11/src/TensorSymmetry/DynamicSymmetry.h | 293 -- .../CXX11/src/TensorSymmetry/StaticSymmetry.h | 236 -- .../Eigen/CXX11/src/TensorSymmetry/Symmetry.h | 338 --- .../src/TensorSymmetry/util/TemplateGroupTheory.h | 669 ----- .../Eigen/CXX11/src/ThreadPool/EventCount.h | 233 -- .../CXX11/src/ThreadPool/NonBlockingThreadPool.h | 274 -- .../Eigen/CXX11/src/ThreadPool/RunQueue.h | 210 -- .../Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h | 154 - .../Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h | 38 - .../Eigen/CXX11/src/ThreadPool/ThreadLocal.h | 22 - .../CXX11/src/ThreadPool/ThreadPoolInterface.h | 33 - .../Eigen/CXX11/src/ThreadPool/ThreadYield.h | 20 - eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 542 ---- .../Eigen/CXX11/src/util/CXX11Workarounds.h | 88 - .../Eigen/CXX11/src/util/EmulateArray.h | 267 -- .../Eigen/CXX11/src/util/EmulateCXX11Meta.h | 311 -- .../Eigen/CXX11/src/util/MaxSizeVector.h | 141 - eigen/unsupported/Eigen/EulerAngles | 43 - eigen/unsupported/Eigen/FFT | 419 --- eigen/unsupported/Eigen/IterativeSolvers | 42 - eigen/unsupported/Eigen/KroneckerProduct | 36 - eigen/unsupported/Eigen/LevenbergMarquardt | 45 - eigen/unsupported/Eigen/MPRealSupport | 209 -- eigen/unsupported/Eigen/MatrixFunctions | 500 ---- eigen/unsupported/Eigen/MoreVectorization | 24 - eigen/unsupported/Eigen/NonLinearOptimization | 134 - eigen/unsupported/Eigen/NumericalDiff | 56 - eigen/unsupported/Eigen/OpenGLSupport | 322 -- eigen/unsupported/Eigen/Polynomials | 138 - eigen/unsupported/Eigen/Skyline | 39 - eigen/unsupported/Eigen/SparseExtra | 53 - eigen/unsupported/Eigen/SpecialFunctions | 63 - eigen/unsupported/Eigen/Splines | 31 - .../Eigen/src/AutoDiff/AutoDiffJacobian.h | 108 - .../Eigen/src/AutoDiff/AutoDiffScalar.h | 694 ----- .../Eigen/src/AutoDiff/AutoDiffVector.h | 220 -- eigen/unsupported/Eigen/src/BVH/BVAlgorithms.h | 293 -- eigen/unsupported/Eigen/src/BVH/KdBVH.h | 223 -- .../src/Eigenvalues/ArpackSelfAdjointEigenSolver.h | 805 ----- .../Eigen/src/EulerAngles/CMakeLists.txt | 6 - .../Eigen/src/EulerAngles/EulerAngles.h | 386 --- .../Eigen/src/EulerAngles/EulerSystem.h | 326 -- eigen/unsupported/Eigen/src/FFT/ei_fftw_impl.h | 261 -- eigen/unsupported/Eigen/src/FFT/ei_kissfft_impl.h | 420 --- .../src/IterativeSolvers/ConstrainedConjGrad.h | 189 -- .../Eigen/src/IterativeSolvers/DGMRES.h | 510 ---- .../unsupported/Eigen/src/IterativeSolvers/GMRES.h | 343 --- .../Eigen/src/IterativeSolvers/IncompleteLU.h | 90 - .../src/IterativeSolvers/IterationController.h | 154 - .../Eigen/src/IterativeSolvers/MINRES.h | 289 -- 
.../Eigen/src/IterativeSolvers/Scaling.h | 187 -- .../src/KroneckerProduct/KroneckerTensorProduct.h | 305 -- .../src/LevenbergMarquardt/CopyrightMINPACK.txt | 52 - .../Eigen/src/LevenbergMarquardt/LMcovar.h | 84 - .../Eigen/src/LevenbergMarquardt/LMonestep.h | 202 -- .../Eigen/src/LevenbergMarquardt/LMpar.h | 160 - .../Eigen/src/LevenbergMarquardt/LMqrsolv.h | 188 -- .../src/LevenbergMarquardt/LevenbergMarquardt.h | 396 --- .../Eigen/src/MatrixFunctions/MatrixExponential.h | 442 --- .../Eigen/src/MatrixFunctions/MatrixFunction.h | 580 ---- .../Eigen/src/MatrixFunctions/MatrixLogarithm.h | 373 --- .../Eigen/src/MatrixFunctions/MatrixPower.h | 709 ----- .../Eigen/src/MatrixFunctions/MatrixSquareRoot.h | 366 --- .../Eigen/src/MatrixFunctions/StemFunction.h | 117 - .../Eigen/src/MoreVectorization/MathFunctions.h | 95 - .../NonLinearOptimization/HybridNonLinearSolver.h | 601 ---- .../src/NonLinearOptimization/LevenbergMarquardt.h | 657 ----- .../Eigen/src/NonLinearOptimization/chkder.h | 66 - .../Eigen/src/NonLinearOptimization/covar.h | 70 - .../Eigen/src/NonLinearOptimization/dogleg.h | 107 - .../Eigen/src/NonLinearOptimization/fdjac1.h | 79 - .../Eigen/src/NonLinearOptimization/lmpar.h | 298 -- .../Eigen/src/NonLinearOptimization/qrsolv.h | 91 - .../Eigen/src/NonLinearOptimization/r1mpyq.h | 30 - .../Eigen/src/NonLinearOptimization/r1updt.h | 99 - .../Eigen/src/NonLinearOptimization/rwupdt.h | 49 - .../Eigen/src/NumericalDiff/NumericalDiff.h | 130 - .../unsupported/Eigen/src/Polynomials/Companion.h | 276 -- .../Eigen/src/Polynomials/PolynomialSolver.h | 406 --- .../Eigen/src/Polynomials/PolynomialUtils.h | 143 - .../Eigen/src/Skyline/SkylineInplaceLU.h | 352 --- .../unsupported/Eigen/src/Skyline/SkylineMatrix.h | 862 ------ .../Eigen/src/Skyline/SkylineMatrixBase.h | 212 -- .../unsupported/Eigen/src/Skyline/SkylineProduct.h | 295 -- .../unsupported/Eigen/src/Skyline/SkylineStorage.h | 259 -- eigen/unsupported/Eigen/src/Skyline/SkylineUtil.h | 89 - .../src/SparseExtra/BlockOfDynamicSparseMatrix.h | 122 - .../Eigen/src/SparseExtra/BlockSparseMatrix.h | 1079 ------- .../Eigen/src/SparseExtra/DynamicSparseMatrix.h | 404 --- eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h | 275 -- .../Eigen/src/SparseExtra/MatrixMarketIterator.h | 247 -- .../Eigen/src/SparseExtra/RandomSetter.h | 327 --- .../SpecialFunctions/SpecialFunctionsArrayAPI.h | 124 - .../SpecialFunctions/SpecialFunctionsFunctors.h | 236 -- .../src/SpecialFunctions/SpecialFunctionsHalf.h | 47 - .../src/SpecialFunctions/SpecialFunctionsImpl.h | 1565 ---------- .../SpecialFunctions/SpecialFunctionsPacketMath.h | 58 - .../arch/CUDA/CudaSpecialFunctions.h | 165 -- eigen/unsupported/Eigen/src/Splines/Spline.h | 507 ---- .../unsupported/Eigen/src/Splines/SplineFitting.h | 430 --- eigen/unsupported/Eigen/src/Splines/SplineFwd.h | 93 - eigen/unsupported/README.txt | 50 - eigen/unsupported/bench/bench_svd.cpp | 123 - eigen/unsupported/doc/CMakeLists.txt | 4 - eigen/unsupported/doc/Overview.dox | 28 - eigen/unsupported/doc/eigendoxy_layout.xml.in | 177 -- eigen/unsupported/doc/examples/BVH_Example.cpp | 50 - eigen/unsupported/doc/examples/CMakeLists.txt | 20 - eigen/unsupported/doc/examples/EulerAngles.cpp | 46 - eigen/unsupported/doc/examples/FFT.cpp | 118 - .../unsupported/doc/examples/MatrixExponential.cpp | 16 - eigen/unsupported/doc/examples/MatrixFunction.cpp | 23 - eigen/unsupported/doc/examples/MatrixLogarithm.cpp | 15 - eigen/unsupported/doc/examples/MatrixPower.cpp | 16 - .../doc/examples/MatrixPower_optimal.cpp | 17 - 
eigen/unsupported/doc/examples/MatrixSine.cpp | 20 - eigen/unsupported/doc/examples/MatrixSinh.cpp | 20 - .../unsupported/doc/examples/MatrixSquareRoot.cpp | 16 - .../unsupported/doc/examples/PolynomialSolver1.cpp | 53 - .../unsupported/doc/examples/PolynomialUtils1.cpp | 20 - eigen/unsupported/doc/snippets/CMakeLists.txt | 26 - eigen/unsupported/test/BVH.cpp | 222 -- eigen/unsupported/test/CMakeLists.txt | 263 -- eigen/unsupported/test/EulerAngles.cpp | 208 -- eigen/unsupported/test/FFT.cpp | 2 - eigen/unsupported/test/FFTW.cpp | 262 -- eigen/unsupported/test/NonLinearOptimization.cpp | 1878 ------------ eigen/unsupported/test/NumericalDiff.cpp | 114 - eigen/unsupported/test/alignedvector3.cpp | 84 - eigen/unsupported/test/autodiff.cpp | 371 --- eigen/unsupported/test/autodiff_scalar.cpp | 101 - eigen/unsupported/test/cxx11_eventcount.cpp | 142 - eigen/unsupported/test/cxx11_meta.cpp | 357 --- .../test/cxx11_non_blocking_thread_pool.cpp | 107 - eigen/unsupported/test/cxx11_runqueue.cpp | 235 -- eigen/unsupported/test/cxx11_tensor_argmax.cpp | 294 -- eigen/unsupported/test/cxx11_tensor_argmax_cuda.cu | 251 -- eigen/unsupported/test/cxx11_tensor_assign.cpp | 370 --- .../test/cxx11_tensor_broadcast_sycl.cpp | 74 - .../unsupported/test/cxx11_tensor_broadcasting.cpp | 194 -- .../test/cxx11_tensor_cast_float16_cuda.cu | 79 - eigen/unsupported/test/cxx11_tensor_casts.cpp | 115 - eigen/unsupported/test/cxx11_tensor_chipping.cpp | 425 --- .../unsupported/test/cxx11_tensor_comparisons.cpp | 84 - .../unsupported/test/cxx11_tensor_complex_cuda.cu | 150 - .../test/cxx11_tensor_complex_cwise_ops_cuda.cu | 94 - .../test/cxx11_tensor_concatenation.cpp | 137 - eigen/unsupported/test/cxx11_tensor_const.cpp | 62 - .../unsupported/test/cxx11_tensor_contract_cuda.cu | 213 -- .../unsupported/test/cxx11_tensor_contraction.cpp | 545 ---- .../unsupported/test/cxx11_tensor_convolution.cpp | 149 - eigen/unsupported/test/cxx11_tensor_cuda.cu | 1284 -------- .../unsupported/test/cxx11_tensor_custom_index.cpp | 100 - eigen/unsupported/test/cxx11_tensor_custom_op.cpp | 111 - eigen/unsupported/test/cxx11_tensor_device.cu | 387 --- .../unsupported/test/cxx11_tensor_device_sycl.cpp | 31 - eigen/unsupported/test/cxx11_tensor_dimension.cpp | 69 - eigen/unsupported/test/cxx11_tensor_empty.cpp | 40 - eigen/unsupported/test/cxx11_tensor_expr.cpp | 314 -- eigen/unsupported/test/cxx11_tensor_fft.cpp | 273 -- eigen/unsupported/test/cxx11_tensor_fixed_size.cpp | 261 -- .../unsupported/test/cxx11_tensor_forced_eval.cpp | 79 - .../test/cxx11_tensor_forced_eval_sycl.cpp | 70 - eigen/unsupported/test/cxx11_tensor_generator.cpp | 91 - eigen/unsupported/test/cxx11_tensor_ifft.cpp | 154 - .../unsupported/test/cxx11_tensor_image_patch.cpp | 757 ----- eigen/unsupported/test/cxx11_tensor_index_list.cpp | 386 --- eigen/unsupported/test/cxx11_tensor_inflation.cpp | 81 - eigen/unsupported/test/cxx11_tensor_intdiv.cpp | 147 - eigen/unsupported/test/cxx11_tensor_io.cpp | 136 - .../unsupported/test/cxx11_tensor_layout_swap.cpp | 61 - eigen/unsupported/test/cxx11_tensor_lvalue.cpp | 42 - eigen/unsupported/test/cxx11_tensor_map.cpp | 277 -- eigen/unsupported/test/cxx11_tensor_math.cpp | 46 - .../test/cxx11_tensor_mixed_indices.cpp | 53 - eigen/unsupported/test/cxx11_tensor_morphing.cpp | 485 --- .../unsupported/test/cxx11_tensor_notification.cpp | 81 - eigen/unsupported/test/cxx11_tensor_of_complex.cpp | 103 - .../test/cxx11_tensor_of_const_values.cpp | 105 - .../test/cxx11_tensor_of_float16_cuda.cu | 491 ---- 
eigen/unsupported/test/cxx11_tensor_of_strings.cpp | 152 - eigen/unsupported/test/cxx11_tensor_padding.cpp | 93 - eigen/unsupported/test/cxx11_tensor_patch.cpp | 172 -- eigen/unsupported/test/cxx11_tensor_random.cpp | 78 - eigen/unsupported/test/cxx11_tensor_random_cuda.cu | 85 - eigen/unsupported/test/cxx11_tensor_reduction.cpp | 508 ---- .../test/cxx11_tensor_reduction_cuda.cu | 154 - .../test/cxx11_tensor_reduction_sycl.cpp | 138 - eigen/unsupported/test/cxx11_tensor_ref.cpp | 248 -- eigen/unsupported/test/cxx11_tensor_reverse.cpp | 190 -- eigen/unsupported/test/cxx11_tensor_roundings.cpp | 62 - eigen/unsupported/test/cxx11_tensor_scan.cpp | 110 - eigen/unsupported/test/cxx11_tensor_scan_cuda.cu | 76 - eigen/unsupported/test/cxx11_tensor_shuffling.cpp | 228 -- eigen/unsupported/test/cxx11_tensor_simple.cpp | 327 --- eigen/unsupported/test/cxx11_tensor_striding.cpp | 119 - eigen/unsupported/test/cxx11_tensor_sugar.cpp | 81 - eigen/unsupported/test/cxx11_tensor_sycl.cpp | 159 - eigen/unsupported/test/cxx11_tensor_symmetry.cpp | 818 ------ .../unsupported/test/cxx11_tensor_thread_pool.cpp | 373 --- eigen/unsupported/test/cxx11_tensor_uint128.cpp | 160 - .../unsupported/test/cxx11_tensor_volume_patch.cpp | 112 - eigen/unsupported/test/dgmres.cpp | 31 - eigen/unsupported/test/forward_adolc.cpp | 141 - eigen/unsupported/test/gmres.cpp | 31 - eigen/unsupported/test/kronecker_product.cpp | 252 -- eigen/unsupported/test/levenberg_marquardt.cpp | 1477 ---------- eigen/unsupported/test/matrix_exponential.cpp | 141 - eigen/unsupported/test/matrix_function.cpp | 189 -- eigen/unsupported/test/matrix_functions.h | 67 - eigen/unsupported/test/matrix_power.cpp | 204 -- eigen/unsupported/test/matrix_square_root.cpp | 31 - eigen/unsupported/test/minres.cpp | 44 - eigen/unsupported/test/mpreal/mpreal.h | 3104 -------------------- eigen/unsupported/test/mpreal_support.cpp | 65 - eigen/unsupported/test/openglsupport.cpp | 333 --- eigen/unsupported/test/polynomialsolver.cpp | 216 -- eigen/unsupported/test/polynomialutils.cpp | 113 - eigen/unsupported/test/sparse_extra.cpp | 147 - eigen/unsupported/test/special_functions.cpp | 345 --- eigen/unsupported/test/splines.cpp | 281 -- 302 files changed, 79962 deletions(-) delete mode 100644 eigen/unsupported/CMakeLists.txt delete mode 100644 eigen/unsupported/Eigen/AdolcForward delete mode 100644 eigen/unsupported/Eigen/AlignedVector3 delete mode 100644 eigen/unsupported/Eigen/ArpackSupport delete mode 100644 eigen/unsupported/Eigen/AutoDiff delete mode 100644 eigen/unsupported/Eigen/BVH delete mode 100644 eigen/unsupported/Eigen/CMakeLists.txt delete mode 100644 eigen/unsupported/Eigen/CXX11/CMakeLists.txt delete mode 100644 eigen/unsupported/Eigen/CXX11/Tensor delete mode 100644 eigen/unsupported/Eigen/CXX11/TensorSymmetry delete mode 100644 eigen/unsupported/Eigen/CXX11/ThreadPool delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/README.md delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h delete mode 
100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h delete mode 100644 
eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h delete mode 100644 eigen/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h delete mode 100644 eigen/unsupported/Eigen/EulerAngles delete mode 100644 eigen/unsupported/Eigen/FFT delete mode 100644 eigen/unsupported/Eigen/IterativeSolvers delete mode 100644 eigen/unsupported/Eigen/KroneckerProduct delete mode 100644 eigen/unsupported/Eigen/LevenbergMarquardt delete mode 100644 eigen/unsupported/Eigen/MPRealSupport delete mode 100644 eigen/unsupported/Eigen/MatrixFunctions delete mode 100644 eigen/unsupported/Eigen/MoreVectorization delete mode 100644 eigen/unsupported/Eigen/NonLinearOptimization delete mode 100644 eigen/unsupported/Eigen/NumericalDiff delete mode 100644 eigen/unsupported/Eigen/OpenGLSupport delete mode 100644 eigen/unsupported/Eigen/Polynomials delete mode 100644 eigen/unsupported/Eigen/Skyline delete mode 100644 eigen/unsupported/Eigen/SparseExtra delete mode 100644 eigen/unsupported/Eigen/SpecialFunctions delete mode 100644 eigen/unsupported/Eigen/Splines 
delete mode 100644 eigen/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h delete mode 100644 eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h delete mode 100644 eigen/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h delete mode 100644 eigen/unsupported/Eigen/src/BVH/BVAlgorithms.h delete mode 100644 eigen/unsupported/Eigen/src/BVH/KdBVH.h delete mode 100644 eigen/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h delete mode 100644 eigen/unsupported/Eigen/src/EulerAngles/CMakeLists.txt delete mode 100644 eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h delete mode 100644 eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h delete mode 100644 eigen/unsupported/Eigen/src/FFT/ei_fftw_impl.h delete mode 100644 eigen/unsupported/Eigen/src/FFT/ei_kissfft_impl.h delete mode 100644 eigen/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h delete mode 100644 eigen/unsupported/Eigen/src/IterativeSolvers/DGMRES.h delete mode 100644 eigen/unsupported/Eigen/src/IterativeSolvers/GMRES.h delete mode 100644 eigen/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h delete mode 100644 eigen/unsupported/Eigen/src/IterativeSolvers/IterationController.h delete mode 100644 eigen/unsupported/Eigen/src/IterativeSolvers/MINRES.h delete mode 100644 eigen/unsupported/Eigen/src/IterativeSolvers/Scaling.h delete mode 100644 eigen/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h delete mode 100644 eigen/unsupported/Eigen/src/LevenbergMarquardt/CopyrightMINPACK.txt delete mode 100644 eigen/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h delete mode 100644 eigen/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h delete mode 100644 eigen/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h delete mode 100644 eigen/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h delete mode 100644 eigen/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h delete mode 100644 eigen/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h delete mode 100644 eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h delete mode 100644 eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h delete mode 100644 eigen/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h delete mode 100644 eigen/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h delete mode 100644 eigen/unsupported/Eigen/src/MatrixFunctions/StemFunction.h delete mode 100644 eigen/unsupported/Eigen/src/MoreVectorization/MathFunctions.h delete mode 100644 eigen/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h delete mode 100644 eigen/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h delete mode 100644 eigen/unsupported/Eigen/src/NonLinearOptimization/chkder.h delete mode 100644 eigen/unsupported/Eigen/src/NonLinearOptimization/covar.h delete mode 100644 eigen/unsupported/Eigen/src/NonLinearOptimization/dogleg.h delete mode 100644 eigen/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h delete mode 100644 eigen/unsupported/Eigen/src/NonLinearOptimization/lmpar.h delete mode 100644 eigen/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h delete mode 100644 eigen/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h delete mode 100644 eigen/unsupported/Eigen/src/NonLinearOptimization/r1updt.h delete mode 100644 eigen/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h delete mode 100644 eigen/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h delete mode 100644 eigen/unsupported/Eigen/src/Polynomials/Companion.h delete mode 100644 
eigen/unsupported/Eigen/src/Polynomials/PolynomialSolver.h delete mode 100644 eigen/unsupported/Eigen/src/Polynomials/PolynomialUtils.h delete mode 100644 eigen/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h delete mode 100644 eigen/unsupported/Eigen/src/Skyline/SkylineMatrix.h delete mode 100644 eigen/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h delete mode 100644 eigen/unsupported/Eigen/src/Skyline/SkylineProduct.h delete mode 100644 eigen/unsupported/Eigen/src/Skyline/SkylineStorage.h delete mode 100644 eigen/unsupported/Eigen/src/Skyline/SkylineUtil.h delete mode 100644 eigen/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h delete mode 100644 eigen/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h delete mode 100644 eigen/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h delete mode 100644 eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h delete mode 100644 eigen/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h delete mode 100644 eigen/unsupported/Eigen/src/SparseExtra/RandomSetter.h delete mode 100644 eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h delete mode 100644 eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h delete mode 100644 eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h delete mode 100644 eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h delete mode 100644 eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h delete mode 100644 eigen/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h delete mode 100644 eigen/unsupported/Eigen/src/Splines/Spline.h delete mode 100644 eigen/unsupported/Eigen/src/Splines/SplineFitting.h delete mode 100644 eigen/unsupported/Eigen/src/Splines/SplineFwd.h delete mode 100644 eigen/unsupported/README.txt delete mode 100644 eigen/unsupported/bench/bench_svd.cpp delete mode 100644 eigen/unsupported/doc/CMakeLists.txt delete mode 100644 eigen/unsupported/doc/Overview.dox delete mode 100644 eigen/unsupported/doc/eigendoxy_layout.xml.in delete mode 100644 eigen/unsupported/doc/examples/BVH_Example.cpp delete mode 100644 eigen/unsupported/doc/examples/CMakeLists.txt delete mode 100644 eigen/unsupported/doc/examples/EulerAngles.cpp delete mode 100644 eigen/unsupported/doc/examples/FFT.cpp delete mode 100644 eigen/unsupported/doc/examples/MatrixExponential.cpp delete mode 100644 eigen/unsupported/doc/examples/MatrixFunction.cpp delete mode 100644 eigen/unsupported/doc/examples/MatrixLogarithm.cpp delete mode 100644 eigen/unsupported/doc/examples/MatrixPower.cpp delete mode 100644 eigen/unsupported/doc/examples/MatrixPower_optimal.cpp delete mode 100644 eigen/unsupported/doc/examples/MatrixSine.cpp delete mode 100644 eigen/unsupported/doc/examples/MatrixSinh.cpp delete mode 100644 eigen/unsupported/doc/examples/MatrixSquareRoot.cpp delete mode 100644 eigen/unsupported/doc/examples/PolynomialSolver1.cpp delete mode 100644 eigen/unsupported/doc/examples/PolynomialUtils1.cpp delete mode 100644 eigen/unsupported/doc/snippets/CMakeLists.txt delete mode 100644 eigen/unsupported/test/BVH.cpp delete mode 100644 eigen/unsupported/test/CMakeLists.txt delete mode 100644 eigen/unsupported/test/EulerAngles.cpp delete mode 100644 eigen/unsupported/test/FFT.cpp delete mode 100644 eigen/unsupported/test/FFTW.cpp delete mode 100644 eigen/unsupported/test/NonLinearOptimization.cpp delete mode 100644 eigen/unsupported/test/NumericalDiff.cpp delete mode 100644 eigen/unsupported/test/alignedvector3.cpp delete mode 
100644 eigen/unsupported/test/autodiff.cpp delete mode 100644 eigen/unsupported/test/autodiff_scalar.cpp delete mode 100644 eigen/unsupported/test/cxx11_eventcount.cpp delete mode 100644 eigen/unsupported/test/cxx11_meta.cpp delete mode 100644 eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp delete mode 100644 eigen/unsupported/test/cxx11_runqueue.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_argmax.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_argmax_cuda.cu delete mode 100644 eigen/unsupported/test/cxx11_tensor_assign.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_broadcasting.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_cast_float16_cuda.cu delete mode 100644 eigen/unsupported/test/cxx11_tensor_casts.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_chipping.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_comparisons.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_complex_cuda.cu delete mode 100644 eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu delete mode 100644 eigen/unsupported/test/cxx11_tensor_concatenation.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_const.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_contract_cuda.cu delete mode 100644 eigen/unsupported/test/cxx11_tensor_contraction.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_convolution.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_cuda.cu delete mode 100644 eigen/unsupported/test/cxx11_tensor_custom_index.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_custom_op.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_device.cu delete mode 100644 eigen/unsupported/test/cxx11_tensor_device_sycl.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_dimension.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_empty.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_expr.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_fft.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_fixed_size.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_forced_eval.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_generator.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_ifft.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_image_patch.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_index_list.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_inflation.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_intdiv.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_io.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_layout_swap.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_lvalue.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_map.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_math.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_mixed_indices.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_morphing.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_notification.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_of_complex.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_of_const_values.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu delete mode 100644 eigen/unsupported/test/cxx11_tensor_of_strings.cpp 
delete mode 100644 eigen/unsupported/test/cxx11_tensor_padding.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_patch.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_random.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_random_cuda.cu delete mode 100644 eigen/unsupported/test/cxx11_tensor_reduction.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_reduction_cuda.cu delete mode 100644 eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_ref.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_reverse.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_roundings.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_scan.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_scan_cuda.cu delete mode 100644 eigen/unsupported/test/cxx11_tensor_shuffling.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_simple.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_striding.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_sugar.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_sycl.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_symmetry.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_thread_pool.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_uint128.cpp delete mode 100644 eigen/unsupported/test/cxx11_tensor_volume_patch.cpp delete mode 100644 eigen/unsupported/test/dgmres.cpp delete mode 100644 eigen/unsupported/test/forward_adolc.cpp delete mode 100644 eigen/unsupported/test/gmres.cpp delete mode 100644 eigen/unsupported/test/kronecker_product.cpp delete mode 100644 eigen/unsupported/test/levenberg_marquardt.cpp delete mode 100644 eigen/unsupported/test/matrix_exponential.cpp delete mode 100644 eigen/unsupported/test/matrix_function.cpp delete mode 100644 eigen/unsupported/test/matrix_functions.h delete mode 100644 eigen/unsupported/test/matrix_power.cpp delete mode 100644 eigen/unsupported/test/matrix_square_root.cpp delete mode 100644 eigen/unsupported/test/minres.cpp delete mode 100644 eigen/unsupported/test/mpreal/mpreal.h delete mode 100644 eigen/unsupported/test/mpreal_support.cpp delete mode 100644 eigen/unsupported/test/openglsupport.cpp delete mode 100644 eigen/unsupported/test/polynomialsolver.cpp delete mode 100644 eigen/unsupported/test/polynomialutils.cpp delete mode 100644 eigen/unsupported/test/sparse_extra.cpp delete mode 100644 eigen/unsupported/test/special_functions.cpp delete mode 100644 eigen/unsupported/test/splines.cpp (limited to 'eigen/unsupported') diff --git a/eigen/unsupported/CMakeLists.txt b/eigen/unsupported/CMakeLists.txt deleted file mode 100644 index 9a56661..0000000 --- a/eigen/unsupported/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -add_subdirectory(Eigen) -add_subdirectory(doc EXCLUDE_FROM_ALL) -if(BUILD_TESTING) - if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) - add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest - else() - add_subdirectory(test EXCLUDE_FROM_ALL) - endif() -endif() diff --git a/eigen/unsupported/Eigen/AdolcForward b/eigen/unsupported/Eigen/AdolcForward deleted file mode 100644 index 15f5f07..0000000 --- a/eigen/unsupported/Eigen/AdolcForward +++ /dev/null @@ -1,156 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2009 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_ADLOC_FORWARD -#define EIGEN_ADLOC_FORWARD - -//-------------------------------------------------------------------------------- -// -// This file provides support for adolc's adouble type in forward mode. -// ADOL-C is a C++ automatic differentiation library, -// see https://projects.coin-or.org/ADOL-C for more information. -// -// Note that the maximal number of directions is controlled by -// the preprocessor token NUMBER_DIRECTIONS. The default is 2. -// -//-------------------------------------------------------------------------------- - -#define ADOLC_TAPELESS -#ifndef NUMBER_DIRECTIONS -# define NUMBER_DIRECTIONS 2 -#endif -#include - -// adolc defines some very stupid macros: -#if defined(malloc) -# undef malloc -#endif - -#if defined(calloc) -# undef calloc -#endif - -#if defined(realloc) -# undef realloc -#endif - -#include - -namespace Eigen { - -/** - * \defgroup AdolcForward_Module Adolc forward module - * This module provides support for adolc's adouble type in forward mode. - * ADOL-C is a C++ automatic differentiation library, - * see https://projects.coin-or.org/ADOL-C for more information. - * It mainly consists in: - * - a struct Eigen::NumTraits specialization - * - overloads of internal::* math function for adtl::adouble type. - * - * Note that the maximal number of directions is controlled by - * the preprocessor token NUMBER_DIRECTIONS. The default is 2. - * - * \code - * #include - * \endcode - */ - //@{ - -} // namespace Eigen - -// Eigen's require a few additional functions which must be defined in the same namespace -// than the custom scalar type own namespace -namespace adtl { - -inline const adouble& conj(const adouble& x) { return x; } -inline const adouble& real(const adouble& x) { return x; } -inline adouble imag(const adouble&) { return 0.; } -inline adouble abs(const adouble& x) { return fabs(x); } -inline adouble abs2(const adouble& x) { return x*x; } - -} - -namespace Eigen { - -template<> struct NumTraits - : NumTraits -{ - typedef adtl::adouble Real; - typedef adtl::adouble NonInteger; - typedef adtl::adouble Nested; - enum { - IsComplex = 0, - IsInteger = 0, - IsSigned = 1, - RequireInitialization = 1, - ReadCost = 1, - AddCost = 1, - MulCost = 1 - }; -}; - -template class AdolcForwardJacobian : public Functor -{ - typedef adtl::adouble ActiveScalar; -public: - - AdolcForwardJacobian() : Functor() {} - AdolcForwardJacobian(const Functor& f) : Functor(f) {} - - // forward constructors - template - AdolcForwardJacobian(const T0& a0) : Functor(a0) {} - template - AdolcForwardJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {} - template - AdolcForwardJacobian(const T0& a0, const T1& a1, const T1& a2) : Functor(a0, a1, a2) {} - - typedef typename Functor::InputType InputType; - typedef typename Functor::ValueType ValueType; - typedef typename Functor::JacobianType JacobianType; - - typedef Matrix ActiveInput; - typedef Matrix ActiveValue; - - void operator() (const InputType& x, ValueType* v, JacobianType* _jac) const - { - eigen_assert(v!=0); - if (!_jac) - { - Functor::operator()(x, v); - return; - } - - JacobianType& jac = *_jac; - - ActiveInput ax = x.template cast(); - ActiveValue av(jac.rows()); - - for (int j=0; j -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_ALIGNED_VECTOR3 -#define EIGEN_ALIGNED_VECTOR3 - -#include - -namespace Eigen { - -/** - * \defgroup AlignedVector3_Module Aligned vector3 module - * - * \code - * #include - * \endcode - */ - //@{ - - -/** \class AlignedVector3 - * - * \brief A vectorization friendly 3D vector - * - * This class represents a 3D vector internally using a 4D vector - * such that vectorization can be seamlessly enabled. Of course, - * the same result can be achieved by directly using a 4D vector. - * This class makes this process simpler. - * - */ -// TODO specialize Cwise -template class AlignedVector3; - -namespace internal { -template struct traits > - : traits > -{ -}; -} - -template class AlignedVector3 - : public MatrixBase > -{ - typedef Matrix<_Scalar,4,1> CoeffType; - CoeffType m_coeffs; - public: - - typedef MatrixBase > Base; - EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3) - using Base::operator*; - - inline Index rows() const { return 3; } - inline Index cols() const { return 1; } - - Scalar* data() { return m_coeffs.data(); } - const Scalar* data() const { return m_coeffs.data(); } - Index innerStride() const { return 1; } - Index outerStride() const { return 3; } - - inline const Scalar& coeff(Index row, Index col) const - { return m_coeffs.coeff(row, col); } - - inline Scalar& coeffRef(Index row, Index col) - { return m_coeffs.coeffRef(row, col); } - - inline const Scalar& coeff(Index index) const - { return m_coeffs.coeff(index); } - - inline Scalar& coeffRef(Index index) - { return m_coeffs.coeffRef(index);} - - - inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z) - : m_coeffs(x, y, z, Scalar(0)) - {} - - inline AlignedVector3(const AlignedVector3& other) - : Base(), m_coeffs(other.m_coeffs) - {} - - template - struct generic_assign_selector {}; - - template struct generic_assign_selector - { - inline static void run(AlignedVector3& dest, const XprType& src) - { - dest.m_coeffs = src; - } - }; - - template struct generic_assign_selector - { - inline static void run(AlignedVector3& dest, const XprType& src) - { - dest.m_coeffs.template head<3>() = src; - dest.m_coeffs.w() = Scalar(0); - } - }; - - template - inline AlignedVector3(const MatrixBase& other) - { - generic_assign_selector::run(*this,other.derived()); - } - - inline AlignedVector3& operator=(const AlignedVector3& other) - { m_coeffs = other.m_coeffs; return *this; } - - template - inline AlignedVector3& operator=(const MatrixBase& other) - { - generic_assign_selector::run(*this,other.derived()); - return *this; - } - - inline AlignedVector3 operator+(const AlignedVector3& other) const - { return AlignedVector3(m_coeffs + other.m_coeffs); } - - inline AlignedVector3& operator+=(const AlignedVector3& other) - { m_coeffs += other.m_coeffs; return *this; } - - inline AlignedVector3 operator-(const AlignedVector3& other) const - { return AlignedVector3(m_coeffs - other.m_coeffs); } - - inline AlignedVector3 operator-=(const AlignedVector3& other) - { m_coeffs -= other.m_coeffs; return *this; } - - inline AlignedVector3 operator*(const Scalar& s) const - { return AlignedVector3(m_coeffs * s); } - - inline friend AlignedVector3 operator*(const Scalar& s,const AlignedVector3& vec) - { return AlignedVector3(s * vec.m_coeffs); } - - inline AlignedVector3& operator*=(const Scalar& s) - { m_coeffs *= s; return *this; } - - inline AlignedVector3 operator/(const Scalar& s) const - { return 
AlignedVector3(m_coeffs / s); } - - inline AlignedVector3& operator/=(const Scalar& s) - { m_coeffs /= s; return *this; } - - inline Scalar dot(const AlignedVector3& other) const - { - eigen_assert(m_coeffs.w()==Scalar(0)); - eigen_assert(other.m_coeffs.w()==Scalar(0)); - return m_coeffs.dot(other.m_coeffs); - } - - inline void normalize() - { - m_coeffs /= norm(); - } - - inline AlignedVector3 normalized() const - { - return AlignedVector3(m_coeffs / norm()); - } - - inline Scalar sum() const - { - eigen_assert(m_coeffs.w()==Scalar(0)); - return m_coeffs.sum(); - } - - inline Scalar squaredNorm() const - { - eigen_assert(m_coeffs.w()==Scalar(0)); - return m_coeffs.squaredNorm(); - } - - inline Scalar norm() const - { - using std::sqrt; - return sqrt(squaredNorm()); - } - - inline AlignedVector3 cross(const AlignedVector3& other) const - { - return AlignedVector3(m_coeffs.cross3(other.m_coeffs)); - } - - template - inline bool isApprox(const MatrixBase& other, const RealScalar& eps=NumTraits::dummy_precision()) const - { - return m_coeffs.template head<3>().isApprox(other,eps); - } - - CoeffType& coeffs() { return m_coeffs; } - const CoeffType& coeffs() const { return m_coeffs; } -}; - -namespace internal { - -template -struct eval, Dense> -{ - typedef const AlignedVector3<_Scalar>& type; -}; - -template -struct evaluator > - : evaluator > -{ - typedef AlignedVector3 XprType; - typedef evaluator > Base; - - evaluator(const XprType &m) : Base(m.coeffs()) {} -}; - -} - -//@} - -} - -#endif // EIGEN_ALIGNED_VECTOR3 diff --git a/eigen/unsupported/Eigen/ArpackSupport b/eigen/unsupported/Eigen/ArpackSupport deleted file mode 100644 index 37a2799..0000000 --- a/eigen/unsupported/Eigen/ArpackSupport +++ /dev/null @@ -1,31 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_ARPACKSUPPORT_MODULE_H -#define EIGEN_ARPACKSUPPORT_MODULE_H - -#include - -#include - -/** \defgroup ArpackSupport_Module Arpack support module - * - * This module provides a wrapper to Arpack, a library for sparse eigenvalue decomposition. - * - * \code - * #include - * \endcode - */ - -#include -#include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h" - -#include - -#endif // EIGEN_ARPACKSUPPORT_MODULE_H -/* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/eigen/unsupported/Eigen/AutoDiff b/eigen/unsupported/Eigen/AutoDiff deleted file mode 100644 index abf5b7d..0000000 --- a/eigen/unsupported/Eigen/AutoDiff +++ /dev/null @@ -1,40 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2009 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_AUTODIFF_MODULE -#define EIGEN_AUTODIFF_MODULE - -namespace Eigen { - -/** - * \defgroup AutoDiff_Module Auto Diff module - * - * This module features forward automatic differentation via a simple - * templated scalar type wrapper AutoDiffScalar. - * - * Warning : this should NOT be confused with numerical differentiation, which - * is a different method and has its own module in Eigen : \ref NumericalDiff_Module. 
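As a quick illustration of the forward-mode AutoDiffScalar wrapper that the module documentation above describes, here is a minimal sketch; it is not part of the removed header, and the include path and the (value, nbDerivatives, derivativeIndex) seeding constructor are assumed to follow the usual unsupported-Eigen conventions:

```cpp
// Illustrative sketch only -- not part of the deleted AutoDiff header.
#include <unsupported/Eigen/AutoDiff>
#include <Eigen/Core>
#include <iostream>

int main()
{
  // Scalar type carrying a value plus a derivative vector of size 1.
  typedef Eigen::AutoDiffScalar<Eigen::VectorXd> ADScalar;

  // Value 2.0, one derivative component, seeded as d/dx at index 0.
  ADScalar x(2.0, 1, 0);

  // f(x) = x^2 + 3x, evaluated and differentiated in one forward pass.
  ADScalar y = x * x + x * 3.0;

  std::cout << "f(2)  = " << y.value() << "\n";           // 10
  std::cout << "f'(2) = " << y.derivatives()[0] << "\n";  // 7
}
```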
- * - * \code - * #include - * \endcode - */ -//@{ - -} - -#include "src/AutoDiff/AutoDiffScalar.h" -// #include "src/AutoDiff/AutoDiffVector.h" -#include "src/AutoDiff/AutoDiffJacobian.h" - -namespace Eigen { -//@} -} - -#endif // EIGEN_AUTODIFF_MODULE diff --git a/eigen/unsupported/Eigen/BVH b/eigen/unsupported/Eigen/BVH deleted file mode 100644 index 0161a54..0000000 --- a/eigen/unsupported/Eigen/BVH +++ /dev/null @@ -1,95 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2009 Ilya Baran -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_BVH_MODULE_H -#define EIGEN_BVH_MODULE_H - -#include -#include -#include -#include -#include - -namespace Eigen { - -/** - * \defgroup BVH_Module BVH module - * \brief This module provides generic bounding volume hierarchy algorithms - * and reference tree implementations. - * - * - * \code - * #include - * \endcode - * - * A bounding volume hierarchy (BVH) can accelerate many geometric queries. This module provides a generic implementation - * of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and minimization - * of a function over the objects in the hierarchy. It also provides intersection and minimization over a cartesian product of - * two BVH's. A BVH accelerates intersection by using the fact that if a query object does not intersect a volume, then it cannot - * intersect any object contained in that volume. Similarly, a BVH accelerates minimization because the minimum of a function - * over a volume is no greater than the minimum of a function over any object contained in it. - * - * Some sample queries that can be written in terms of intersection are: - * - Determine all points where a ray intersects a triangle mesh - * - Given a set of points, determine which are contained in a query sphere - * - Given a set of spheres, determine which contain the query point - * - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a point \f$(x,y,r)\f$ - * in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$ direction) - * - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian product of the set - * of points with itself) - * - * Some sample queries that can be written in terms of function minimization over a set of objects are: - * - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the ray) - * - Given a polyline and a query point, determine the closest point on the polyline to the query - * - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the function) - * - Determine how far two meshes are from colliding (this is also a cartesian product query) - * - * This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding volumes) and - * from the particulars of the query. To enable abstraction from the BVH, the BVH is required to implement a generic mechanism - * for traversal. To abstract from the query, the query is responsible for keeping track of results. 
- * - * To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample implementation): \code - typedef Volume //the type of bounding volume - typedef Object //the type of object in the hierarchy - typedef Index //a reference to a node in the hierarchy--typically an int or a pointer - typedef VolumeIterator //an iterator type over node children--returns Index - typedef ObjectIterator //an iterator over object (leaf) children--returns const Object & - Index getRootIndex() const //returns the index of the hierarchy root - const Volume &getVolume(Index index) const //returns the bounding volume of the node at given index - void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd, - ObjectIterator &outOBegin, ObjectIterator &outOEnd) const - //getChildren takes a node index and makes [outVBegin, outVEnd) range over its node children - //and [outOBegin, outOEnd) range over its object children - \endcode - * - * To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a minimizer or intersector. - * For an intersection query on a single BVH, the intersector encapsulates the query and must provide two functions: - * \code - bool intersectVolume(const Volume &volume) //returns true if the query intersects the volume - bool intersectObject(const Object &object) //returns true if the intersection search should terminate immediately - \endcode - * The guarantee that BVIntersect provides is that intersectObject will be called on every object whose bounding volume - * intersects the query (but possibly on other objects too) unless the search is terminated prematurely. It is the - * responsibility of the intersectObject function to keep track of the results in whatever manner is appropriate. - * The cartesian product intersection and the BVMinimize queries are similar--see their individual documentation. 
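To make the intersector interface just described concrete, here is a hedged sketch of a query object that collects every point lying inside a sphere. The SphereIntersector name and the bounding_box overload are illustrative; only KdBVH, BVIntersect and AlignedBox3d are taken from the library itself:

```cpp
// Illustrative sketch only -- not part of the deleted BVH header.
#include <unsupported/Eigen/BVH>
#include <vector>

using Eigen::Vector3d;
using Eigen::AlignedBox3d;

// KdBVH over raw points needs a bounding_box() overload for the object type.
namespace Eigen {
  AlignedBox3d bounding_box(const Vector3d &v) { return AlignedBox3d(v, v); }
}

struct SphereIntersector
{
  Vector3d center;
  double   radius;
  std::vector<Vector3d> hits;

  // Called on node volumes: prune subtrees the sphere cannot touch.
  bool intersectVolume(const AlignedBox3d &box)
  { return box.squaredExteriorDistance(center) <= radius * radius; }

  // Called on leaf objects: record hits; returning true would stop the search.
  bool intersectObject(const Vector3d &p)
  {
    if ((p - center).squaredNorm() <= radius * radius)
      hits.push_back(p);
    return false;
  }
};

// Usage sketch:
//   Eigen::KdBVH<double, 3, Vector3d> tree(points.begin(), points.end());
//   SphereIntersector query{Vector3d(0, 0, 0), 1.0, {}};
//   Eigen::BVIntersect(tree, query);
```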
- * - * The following is a simple but complete example for how to use the BVH to accelerate the search for a closest red-blue point pair: - * \include BVH_Example.cpp - * Output: \verbinclude BVH_Example.out - */ -} - -//@{ - -#include "src/BVH/BVAlgorithms.h" -#include "src/BVH/KdBVH.h" - -//@} - -#endif // EIGEN_BVH_MODULE_H diff --git a/eigen/unsupported/Eigen/CMakeLists.txt b/eigen/unsupported/Eigen/CMakeLists.txt deleted file mode 100644 index 631a060..0000000 --- a/eigen/unsupported/Eigen/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -set(Eigen_HEADERS - AdolcForward - AlignedVector3 - ArpackSupport - AutoDiff - BVH - EulerAngles - FFT - IterativeSolvers - KroneckerProduct - LevenbergMarquardt - MatrixFunctions - MoreVectorization - MPRealSupport - NonLinearOptimization - NumericalDiff - OpenGLSupport - Polynomials - Skyline - SparseExtra - SpecialFunctions - Splines - ) - -install(FILES - ${Eigen_HEADERS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel - ) - -install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h") - -add_subdirectory(CXX11) diff --git a/eigen/unsupported/Eigen/CXX11/CMakeLists.txt b/eigen/unsupported/Eigen/CXX11/CMakeLists.txt deleted file mode 100644 index 385ed24..0000000 --- a/eigen/unsupported/Eigen/CXX11/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool) - -install(FILES - ${Eigen_CXX11_HEADERS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel - ) - -install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h") diff --git a/eigen/unsupported/Eigen/CXX11/Tensor b/eigen/unsupported/Eigen/CXX11/Tensor deleted file mode 100644 index bb6523d..0000000 --- a/eigen/unsupported/Eigen/CXX11/Tensor +++ /dev/null @@ -1,154 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// Copyright (C) 2013 Christian Seiler -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -//#ifndef EIGEN_CXX11_TENSOR_MODULE -//#define EIGEN_CXX11_TENSOR_MODULE - -#include "../../../Eigen/Core" - -#ifdef EIGEN_USE_SYCL -#undef min -#undef max -#undef isnan -#undef isinf -#undef isfinite -#include -#include -#include -#include -#endif - -#include - -#include "../SpecialFunctions" -#include "src/util/CXX11Meta.h" -#include "src/util/MaxSizeVector.h" - -/** \defgroup CXX11_Tensor_Module Tensor Module - * - * This module provides a Tensor class for storing arbitrarily indexed - * objects. - * - * \code - * #include - * \endcode - * - * Much of the documentation can be found \ref eigen_tensors "here". 
- */ - -#include -#include -#include - -#ifdef _WIN32 -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -#else -#include -#endif - -#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 -#include -#endif - -#ifdef _WIN32 -#include -#elif defined(__APPLE__) -#include -#else -#include -#endif - -#ifdef EIGEN_USE_THREADS -#include "ThreadPool" -#endif - -#ifdef EIGEN_USE_GPU -#include -#include -#if __cplusplus >= 201103L -#include -#include -#endif -#endif - -#include "src/Tensor/TensorMacros.h" -#include "src/Tensor/TensorForwardDeclarations.h" -#include "src/Tensor/TensorMeta.h" -#include "src/Tensor/TensorFunctors.h" -#include "src/Tensor/TensorCostModel.h" -#include "src/Tensor/TensorDeviceDefault.h" -#include "src/Tensor/TensorDeviceThreadPool.h" -#include "src/Tensor/TensorDeviceCuda.h" -#include "src/Tensor/TensorDeviceSycl.h" -#include "src/Tensor/TensorIndexList.h" -#include "src/Tensor/TensorDimensionList.h" -#include "src/Tensor/TensorDimensions.h" -#include "src/Tensor/TensorInitializer.h" -#include "src/Tensor/TensorTraits.h" -#include "src/Tensor/TensorRandom.h" -#include "src/Tensor/TensorUInt128.h" -#include "src/Tensor/TensorIntDiv.h" -#include "src/Tensor/TensorGlobalFunctions.h" - -#include "src/Tensor/TensorBase.h" - -#include "src/Tensor/TensorEvaluator.h" -#include "src/Tensor/TensorExpr.h" -#include "src/Tensor/TensorReduction.h" -#include "src/Tensor/TensorReductionCuda.h" -#include "src/Tensor/TensorArgMax.h" -#include "src/Tensor/TensorConcatenation.h" -#include "src/Tensor/TensorContractionMapper.h" -#include "src/Tensor/TensorContractionBlocking.h" -#include "src/Tensor/TensorContraction.h" -#include "src/Tensor/TensorContractionThreadPool.h" -#include "src/Tensor/TensorContractionCuda.h" -#include "src/Tensor/TensorConversion.h" -#include "src/Tensor/TensorConvolution.h" -#include "src/Tensor/TensorFFT.h" -#include "src/Tensor/TensorPatch.h" -#include "src/Tensor/TensorImagePatch.h" -#include "src/Tensor/TensorVolumePatch.h" -#include "src/Tensor/TensorBroadcasting.h" -#include "src/Tensor/TensorChipping.h" -#include "src/Tensor/TensorInflation.h" -#include "src/Tensor/TensorLayoutSwap.h" -#include "src/Tensor/TensorMorphing.h" -#include "src/Tensor/TensorPadding.h" -#include "src/Tensor/TensorReverse.h" -#include "src/Tensor/TensorShuffling.h" -#include "src/Tensor/TensorStriding.h" -#include "src/Tensor/TensorCustomOp.h" -#include "src/Tensor/TensorEvalTo.h" -#include "src/Tensor/TensorForcedEval.h" -#include "src/Tensor/TensorGenerator.h" -#include "src/Tensor/TensorAssign.h" -#include "src/Tensor/TensorScan.h" - -#include "src/Tensor/TensorSycl.h" -#include "src/Tensor/TensorExecutor.h" -#include "src/Tensor/TensorDevice.h" - -#include "src/Tensor/TensorStorage.h" -#include "src/Tensor/Tensor.h" -#include "src/Tensor/TensorFixedSize.h" -#include "src/Tensor/TensorMap.h" -#include "src/Tensor/TensorRef.h" - -#include "src/Tensor/TensorIO.h" - -#include - -//#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/eigen/unsupported/Eigen/CXX11/TensorSymmetry b/eigen/unsupported/Eigen/CXX11/TensorSymmetry deleted file mode 100644 index fb1b0c0..0000000 --- a/eigen/unsupported/Eigen/CXX11/TensorSymmetry +++ /dev/null @@ -1,42 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2013 Christian Seiler -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE -#define EIGEN_CXX11_TENSORSYMMETRY_MODULE - -#include - -#include - -#include "src/util/CXX11Meta.h" - -/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module - * - * This module provides a classes that allow for the definition of - * symmetries w.r.t. tensor indices. - * - * Including this module will implicitly include the Tensor module. - * - * \code - * #include - * \endcode - */ - -#include "src/TensorSymmetry/util/TemplateGroupTheory.h" -#include "src/TensorSymmetry/Symmetry.h" -#include "src/TensorSymmetry/StaticSymmetry.h" -#include "src/TensorSymmetry/DynamicSymmetry.h" - -#include - -#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE - -/* - * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; - */ diff --git a/eigen/unsupported/Eigen/CXX11/ThreadPool b/eigen/unsupported/Eigen/CXX11/ThreadPool deleted file mode 100644 index 09d637e..0000000 --- a/eigen/unsupported/Eigen/CXX11/ThreadPool +++ /dev/null @@ -1,65 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_THREADPOOL_MODULE -#define EIGEN_CXX11_THREADPOOL_MODULE - -#include "../../../Eigen/Core" - -#include - -/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module - * - * This module provides 2 threadpool implementations - * - a simple reference implementation - * - a faster non blocking implementation - * - * This module requires C++11. - * - * \code - * #include - * \endcode - */ - - -// The code depends on CXX11, so only include the module if the -// compiler supports it. -#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900 -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "src/util/CXX11Meta.h" -#include "src/util/MaxSizeVector.h" - -#include "src/ThreadPool/ThreadLocal.h" -#include "src/ThreadPool/ThreadYield.h" -#include "src/ThreadPool/EventCount.h" -#include "src/ThreadPool/RunQueue.h" -#include "src/ThreadPool/ThreadPoolInterface.h" -#include "src/ThreadPool/ThreadEnvironment.h" -#include "src/ThreadPool/SimpleThreadPool.h" -#include "src/ThreadPool/NonBlockingThreadPool.h" - -#endif - -#include - -#endif // EIGEN_CXX11_THREADPOOL_MODULE - diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md b/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md deleted file mode 100644 index da70fa2..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +++ /dev/null @@ -1,1760 +0,0 @@ -# Eigen Tensors {#eigen_tensors} - -Tensors are multidimensional arrays of elements. Elements are typically scalars, -but more complex types such as strings are also supported. - -[TOC] - -## Tensor Classes - -You can manipulate a tensor with one of the following classes. They all are in -the namespace `::Eigen.` - - -### Class Tensor - -This is the class to use to create a tensor and allocate memory for it. The -class is templatized with the tensor datatype, such as float or int, and the -tensor rank. 
The rank is the number of dimensions, for example rank 2 is a -matrix. - -Tensors of this class are resizable. For example, if you assign a tensor of a -different size to a Tensor, that tensor is resized to match its new value. - -#### Constructor `Tensor(size0, size1, ...)` - -Constructor for a Tensor. The constructor must be passed `rank` integers -indicating the sizes of the instance along each of the the `rank` -dimensions. - - // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns - // memory to hold 24 floating point values (24 = 2 x 3 x 4). - Tensor t_3d(2, 3, 4); - - // Resize t_3d by assigning a tensor of different sizes, but same rank. - t_3d = Tensor(3, 4, 3); - -#### Constructor `Tensor(size_array)` - -Constructor where the sizes for the constructor are specified as an array of -values instead of an explicitly list of parameters. The array type to use is -`Eigen::array`. The array can be constructed automatically -from an initializer list. - - // Create a tensor of strings of rank 2 with sizes 5, 7. - Tensor t_2d({5, 7}); - - -### Class `TensorFixedSize>` - -Class to use for tensors of fixed size, where the size is known at compile -time. Fixed sized tensors can provide very fast computations because all their -dimensions are known by the compiler. FixedSize tensors are not resizable. - -If the total number of elements in a fixed size tensor is small enough the -tensor data is held onto the stack and does not cause heap allocation and free. - - // Create a 4 x 3 tensor of floats. - TensorFixedSize> t_4x3; - -### Class `TensorMap>` - -This is the class to use to create a tensor on top of memory allocated and -owned by another part of your code. It allows to view any piece of allocated -memory as a Tensor. Instances of this class do not own the memory where the -data are stored. - -A TensorMap is not resizable because it does not own the memory where its data -are stored. - -#### Constructor `TensorMap>(data, size0, size1, ...)` - -Constructor for a Tensor. The constructor must be passed a pointer to the -storage for the data, and "rank" size attributes. The storage has to be -large enough to hold all the data. - - // Map a tensor of ints on top of stack-allocated storage. - int storage[128]; // 2 x 4 x 2 x 8 = 128 - TensorMap> t_4d(storage, 2, 4, 2, 8); - - // The same storage can be viewed as a different tensor. - // You can also pass the sizes as an array. - TensorMap> t_2d(storage, 16, 8); - - // You can also map fixed-size tensors. Here we get a 1d view of - // the 2d fixed-size tensor. - TensorFixedSize> t_4x3; - TensorMap> t_12(t_4x3.data(), 12); - - -#### Class `TensorRef` - -See Assigning to a TensorRef below. - -## Accessing Tensor Elements - -#### ` tensor(index0, index1...)` - -Return the element at position `(index0, index1...)` in tensor -`tensor`. You must pass as many parameters as the rank of `tensor`. -The expression can be used as an l-value to set the value of the element at the -specified position. The value returned is of the datatype of the tensor. - - // Set the value of the element at position (0, 1, 0); - Tensor t_3d(2, 3, 4); - t_3d(0, 1, 0) = 12.0f; - - // Initialize all elements to random values. - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 4; ++k) { - t_3d(i, j, k) = ...some random value...; - } - } - } - - // Print elements of a tensor. 
- for (int i = 0; i < 2; ++i) { - LOG(INFO) << t_3d(i, 0, 0); - } - - -## TensorLayout - -The tensor library supports 2 layouts: `ColMajor` (the default) and -`RowMajor`. Only the default column major layout is currently fully -supported, and it is therefore not recommended to attempt to use the row major -layout at the moment. - -The layout of a tensor is optionally specified as part of its type. If not -specified explicitly column major is assumed. - - Tensor col_major; // equivalent to Tensor - TensorMap > row_major(data, ...); - -All the arguments to an expression must use the same layout. Attempting to mix -different layouts will result in a compilation error. - -It is possible to change the layout of a tensor or an expression using the -`swap_layout()` method. Note that this will also reverse the order of the -dimensions. - - Tensor col_major(2, 4); - Tensor row_major(2, 4); - - Tensor col_major_result = col_major; // ok, layouts match - Tensor col_major_result = row_major; // will not compile - - // Simple layout swap - col_major_result = row_major.swap_layout(); - eigen_assert(col_major_result.dimension(0) == 4); - eigen_assert(col_major_result.dimension(1) == 2); - - // Swap the layout and preserve the order of the dimensions - array shuffle(1, 0); - col_major_result = row_major.swap_layout().shuffle(shuffle); - eigen_assert(col_major_result.dimension(0) == 2); - eigen_assert(col_major_result.dimension(1) == 4); - - -## Tensor Operations - -The Eigen Tensor library provides a vast library of operations on Tensors: -numerical operations such as addition and multiplication, geometry operations -such as slicing and shuffling, etc. These operations are available as methods -of the Tensor classes, and in some cases as operator overloads. For example -the following code computes the elementwise addition of two tensors: - - Tensor t1(2, 3, 4); - ...set some values in t1... - Tensor t2(2, 3, 4); - ...set some values in t2... - // Set t3 to the element wise sum of t1 and t2 - Tensor t3 = t1 + t2; - -While the code above looks easy enough, it is important to understand that the -expression `t1 + t2` is not actually adding the values of the tensors. The -expression instead constructs a "tensor operator" object of the class -TensorCwiseBinaryOp, which has references to the tensors -`t1` and `t2`. This is a small C++ object that knows how to add -`t1` and `t2`. It is only when the value of the expression is assigned -to the tensor `t3` that the addition is actually performed. Technically, -this happens through the overloading of `operator=()` in the Tensor class. - -This mechanism for computing tensor expressions allows for lazy evaluation and -optimizations which are what make the tensor library very fast. - -Of course, the tensor operators do nest, and the expression `t1 + t2 * 0.3f` -is actually represented with the (approximate) tree of operators: - - TensorCwiseBinaryOp(t1, TensorCwiseUnaryOp(t2, 0.3f)) - - -### Tensor Operations and C++ "auto" - -Because Tensor operations create tensor operators, the C++ `auto` keyword -does not have its intuitive meaning. Consider these 2 lines of code: - - Tensor t3 = t1 + t2; - auto t4 = t1 + t2; - -In the first line we allocate the tensor `t3` and it will contain the -result of the addition of `t1` and `t2`. In the second line, `t4` -is actually the tree of tensor operators that will compute the addition of -`t1` and `t2`. 
In fact, `t4` is *not* a tensor and you cannot get -the values of its elements: - - Tensor t3 = t1 + t2; - cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0) - - auto t4 = t1 + t2; - cout << t4(0, 0, 0); // Compilation error! - -When you use `auto` you do not get a Tensor as a result but instead a -non-evaluated expression. So only use `auto` to delay evaluation. - -Unfortunately, there is no single underlying concrete type for holding -non-evaluated expressions, hence you have to use auto in the case when you do -want to hold non-evaluated expressions. - -When you need the results of set of tensor computations you have to assign the -result to a Tensor that will be capable of holding onto them. This can be -either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing -piece of memory. All the following will work: - - auto t4 = t1 + t2; - - Tensor result = t4; // Could also be: result(t4); - cout << result(0, 0, 0); - - TensorMap result(, , ...) = t4; - cout << result(0, 0, 0); - - TensorFixedSize> result = t4; - cout << result(0, 0, 0); - -Until you need the results, you can keep the operation around, and even reuse -it for additional operations. As long as you keep the expression as an -operation, no computation is performed. - - // One way to compute exp((t1 + t2) * 0.2f); - auto t3 = t1 + t2; - auto t4 = t3 * 0.2f; - auto t5 = t4.exp(); - Tensor result = t5; - - // Another way, exactly as efficient as the previous one: - Tensor result = ((t1 + t2) * 0.2f).exp(); - -### Controlling When Expression are Evaluated - -There are several ways to control when expressions are evaluated: - -* Assignment to a Tensor, TensorFixedSize, or TensorMap. -* Use of the eval() method. -* Assignment to a TensorRef. - -#### Assigning to a Tensor, TensorFixedSize, or TensorMap. - -The most common way to evaluate an expression is to assign it to a Tensor. In -the example below, the `auto` declarations make the intermediate values -"Operations", not Tensors, and do not cause the expressions to be evaluated. -The assignment to the Tensor `result` causes the evaluation of all the -operations. - - auto t3 = t1 + t2; // t3 is an Operation. - auto t4 = t3 * 0.2f; // t4 is an Operation. - auto t5 = t4.exp(); // t5 is an Operation. - Tensor result = t5; // The operations are evaluated. - -If you know the ranks and sizes of the Operation value you can assign the -Operation to a TensorFixedSize instead of a Tensor, which is a bit more -efficient. - - // We know that the result is a 4x4x2 tensor! - TensorFixedSize> result = t5; - -Simiarly, assigning an expression to a TensorMap causes its evaluation. Like -tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to -have the rank and sizes of the expression that are assigned to them. - -#### Calling `eval()`. - -When you compute large composite expressions, you sometimes want to tell Eigen -that an intermediate value in the expression tree is worth evaluating ahead of -time. This is done by inserting a call to the `eval()` method of the -expression Operation. - - // The previous example could have been written: - Tensor result = ((t1 + t2) * 0.2f).exp(); - - // If you want to compute (t1 + t2) once ahead of time you can write: - Tensor result = ((t1 + t2).eval() * 0.2f).exp(); - -Semantically, calling `eval()` is equivalent to materializing the value of -the expression in a temporary Tensor of the right size. The code above in -effect does: - - // .eval() knows the size! 
- TensorFixedSize> tmp = t1 + t2; - Tensor result = (tmp * 0.2f).exp(); - -Note that the return value of `eval()` is itself an Operation, so the -following code does not do what you may think: - - // Here t3 is an evaluation Operation. t3 has not been evaluated yet. - auto t3 = (t1 + t2).eval(); - - // You can use t3 in another expression. Still no evaluation. - auto t4 = (t3 * 0.2f).exp(); - - // The value is evaluated when you assign the Operation to a Tensor, using - // an intermediate tensor to represent t3.x - Tensor result = t4; - -While in the examples above calling `eval()` does not make a difference in -performance, in other cases it can make a huge difference. In the expression -below the `broadcast()` expression causes the `X.maximum()` expression -to be evaluated many times: - - Tensor<...> X ...; - Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast)) - * beta).exp(); - -Inserting a call to `eval()` between the `maximum()` and -`reshape()` calls guarantees that maximum() is only computed once and -greatly speeds-up execution: - - Tensor<...> Y = - ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) - * beta).exp(); - -In the other example below, the tensor `Y` is both used in the expression -and its assignment. This is an aliasing problem and if the evaluation is not -done in the right order Y will be updated incrementally during the evaluation -resulting in bogus results: - - Tensor<...> Y ...; - Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast)); - -Inserting a call to `eval()` between the `sum()` and `reshape()` -expressions ensures that the sum is computed before any updates to `Y` are -done. - - Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); - -Note that an eval around the full right hand side expression is not needed -because the generated has to compute the i-th value of the right hand side -before assigning it to the left hand side. - -However, if you were assigning the expression value to a shuffle of `Y` -then you would need to force an eval for correctness by adding an `eval()` -call for the right hand side: - - Y.shuffle(...) = - (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval(); - - -#### Assigning to a `TensorRef`. - -If you need to access only a few elements from the value of an expression you -can avoid materializing the value in a full tensor by using a TensorRef. - -A TensorRef is a small wrapper class for any Eigen Operation. It provides -overloads for the `()` operator that let you access individual values in -the expression. TensorRef is convenient, because the Operation themselves do -not provide a way to access individual elements. - - // Create a TensorRef for the expression. The expression is not - // evaluated yet. - TensorRef > ref = ((t1 + t2) * 0.2f).exp(); - - // Use "ref" to access individual elements. The expression is evaluated - // on the fly. - float at_0 = ref(0, 0, 0); - cout << ref(0, 1, 0); - -Only use TensorRef when you need a subset of the values of the expression. -TensorRef only computes the values you access. However note that if you are -going to access all the values it will be much faster to materialize the -results in a Tensor first. - -In some cases, if the full Tensor result would be very large, you may save -memory by accessing it as a TensorRef. But not always. So don't count on it. 
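To make this trade-off concrete, here is a small sketch (not from the original text; the tensor names, sizes, and expression are illustrative) contrasting full materialization with TensorRef access:

    Eigen::Tensor<float, 2> small(10, 10);
    small.setRandom();
    Eigen::array<int, 2> bcast{{100, 100}};  // replicate to 1000 x 1000

    // Materializing: all 1,000,000 coefficients of the expression are
    // computed once and stored in "full".
    Eigen::Tensor<float, 2> full = (small.broadcast(bcast) * 0.5f).exp();
    float v1 = full(3, 7);

    // TensorRef: nothing is computed up front; only the coefficients you
    // actually read are evaluated, and they are re-evaluated on every access.
    Eigen::TensorRef<Eigen::Tensor<float, 2>> ref =
        (small.broadcast(bcast) * 0.5f).exp();
    float v2 = ref(3, 7);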
- - -### Controlling How Expressions Are Evaluated - -The tensor library provides several implementations of the various operations -such as contractions and convolutions. The implementations are optimized for -different environments: single threaded on CPU, multi threaded on CPU, or on a -GPU using cuda. Additional implementations may be added later. - -You can choose which implementation to use with the `device()` call. If -you do not choose an implementation explicitly the default implementation that -uses a single thread on the CPU is used. - -The default implementation has been optimized for recent Intel CPUs, taking -advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the -library on ARM CPUs. Note that you need to pass compiler-dependent flags -to enable the use of SSE, AVX, and other instructions. - -For example, the following code adds two tensors using the default -single-threaded CPU implementation: - - Tensor a(30, 40); - Tensor b(30, 40); - Tensor c = a + b; - -To choose a different implementation you have to insert a `device()` call -before the assignment of the result. For technical C++ reasons this requires -that the Tensor for the result be declared on its own. This means that you -have to know the size of the result. - - Eigen::Tensor c(30, 40); - c.device(...) = a + b; - -The call to `device()` must be the last call on the left of the operator=. - -You must pass to the `device()` call an Eigen device object. There are -presently three devices you can use: DefaultDevice, ThreadPoolDevice and -GpuDevice. - - -#### Evaluating With the DefaultDevice - -This is exactly the same as not inserting a `device()` call. - - DefaultDevice my_device; - c.device(my_device) = a + b; - -#### Evaluating with a Thread Pool - - // Create the Eigen ThreadPoolDevice. - Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */); - - // Now just use the device when evaluating expressions. - Eigen::Tensor c(30, 50); - c.device(my_device) = a.contract(b, dot_product_dims); - - -#### Evaluating On GPU - -This is presently a bit more complicated than just using a thread pool device. -You need to create a GPU device but you also need to explicitly allocate the -memory for tensors with cuda. - - -## API Reference - -### Datatypes - -In the documentation of the tensor methods and Operation we mention datatypes -that are tensor-type specific: - -#### `::``Dimensions` - -Acts like an array of ints. Has an `int size` attribute, and can be -indexed like an array to access individual values. Used to represent the -dimensions of a tensor. See `dimensions()`. - -#### `::``Index` - -Acts like an `int`. Used for indexing tensors along their dimensions. See -`operator()`, `dimension()`, and `size()`. - -#### `::``Scalar` - -Represents the datatype of individual tensor elements. For example, for a -`Tensor`, `Scalar` is the type `float`. See -`setConstant()`. - -#### `` - -We use this pseudo type to indicate that a tensor Operation is returned by a -method. We indicate in the text the type and dimensions of the tensor that the -Operation returns after evaluation. - -The Operation will have to be evaluated, for example by assigning it to a -tensor, before you can access the values of the resulting tensor. You can also -access the values through a TensorRef. - - -## Built-in Tensor Methods - -These are usual C++ methods that act on tensors immediately. They are not -Operations which provide delayed evaluation of their results. 
Unless specified -otherwise, all the methods listed below are available on all tensor classes: -Tensor, TensorFixedSize, and TensorMap. - -## Metadata - -### `int NumDimensions` - -Constant value indicating the number of dimensions of a Tensor. This is also -known as the tensor "rank". - - Eigen::Tensor a(3, 4); - cout << "Dims " << a.NumDimensions; - => Dims 2 - -### `Dimensions dimensions()` - -Returns an array-like object representing the dimensions of the tensor. -The actual type of the `dimensions()` result is `::``Dimensions`. - - Eigen::Tensor a(3, 4); - const Eigen::Tensor::Dimensions& d = a.dimensions(); - cout << "Dim size: " << d.size << ", dim 0: " << d[0] - << ", dim 1: " << d[1]; - => Dim size: 2, dim 0: 3, dim 1: 4 - -If you use a C++11 compiler, you can use `auto` to simplify the code: - - const auto& d = a.dimensions(); - cout << "Dim size: " << d.size << ", dim 0: " << d[0] - << ", dim 1: " << d[1]; - => Dim size: 2, dim 0: 3, dim 1: 4 - -### `Index dimension(Index n)` - -Returns the n-th dimension of the tensor. The actual type of the -`dimension()` result is `::``Index`, but you can -always use it like an int. - - Eigen::Tensor a(3, 4); - int dim1 = a.dimension(1); - cout << "Dim 1: " << dim1; - => Dim 1: 4 - -### `Index size()` - -Returns the total number of elements in the tensor. This is the product of all -the tensor dimensions. The actual type of the `size()` result is -`::``Index`, but you can always use it like an int. - - Eigen::Tensor a(3, 4); - cout << "Size: " << a.size(); - => Size: 12 - - -### Getting Dimensions From An Operation - -A few operations provide `dimensions()` directly, -e.g. `TensorReslicingOp`. Most operations defer calculating dimensions -until the operation is being evaluated. If you need access to the dimensions -of a deferred operation, you can wrap it in a TensorRef (see Assigning to a -TensorRef above), which provides `dimensions()` and `dimension()` as -above. - -TensorRef can also wrap the plain Tensor types, so this is a useful idiom in -templated contexts where the underlying object could be either a raw Tensor -or some deferred operation (e.g. a slice of a Tensor). In this case, the -template code can wrap the object in a TensorRef and reason about its -dimensionality while remaining agnostic to the underlying type. - - -## Constructors - -### Tensor - -Creates a tensor of the specified size. The number of arguments must be equal -to the rank of the tensor. The content of the tensor is not initialized. - - Eigen::Tensor a(3, 4); - cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; - => NumRows: 3 NumCols: 4 - -### TensorFixedSize - -Creates a tensor of the specified size. The number of arguments in the Sizes<> -template parameter determines the rank of the tensor. The content of the tensor -is not initialized. - - Eigen::TensorFixedSize> a; - cout << "Rank: " << a.rank() << endl; - => Rank: 2 - cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; - => NumRows: 3 NumCols: 4 - -### TensorMap - -Creates a tensor mapping an existing array of data. The data must not be freed -until the TensorMap is discarded, and the size of the data must be large enough -to accommodate the coefficients of the tensor. 
- - float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - Eigen::TensorMap> a(data, 3, 4); - cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; - => NumRows: 3 NumCols: 4 - cout << "a(1, 2): " << a(1, 2) << endl; - => a(1, 2): 7 - - -## Contents Initialization - -When a new Tensor or a new TensorFixedSize are created, memory is allocated to -hold all the tensor elements, but the memory is not initialized. Similarly, -when a new TensorMap is created on top of non-initialized memory the memory its -contents are not initialized. - -You can use one of the methods below to initialize the tensor memory. These -have an immediate effect on the tensor and return the tensor itself as a -result. These are not tensor Operations which delay evaluation. - -### ` setConstant(const Scalar& val)` - -Sets all elements of the tensor to the constant value `val`. `Scalar` -is the type of data stored in the tensor. You can pass any value that is -convertible to that type. - -Returns the tensor itself in case you want to chain another call. - - a.setConstant(12.3f); - cout << "Constant: " << endl << a << endl << endl; - => - Constant: - 12.3 12.3 12.3 12.3 - 12.3 12.3 12.3 12.3 - 12.3 12.3 12.3 12.3 - -Note that `setConstant()` can be used on any tensor where the element type -has a copy constructor and an `operator=()`: - - Eigen::Tensor a(2, 3); - a.setConstant("yolo"); - cout << "String tensor: " << endl << a << endl << endl; - => - String tensor: - yolo yolo yolo - yolo yolo yolo - - -### ` setZero()` - -Fills the tensor with zeros. Equivalent to `setConstant(Scalar(0))`. -Returns the tensor itself in case you want to chain another call. - - a.setZero(); - cout << "Zeros: " << endl << a << endl << endl; - => - Zeros: - 0 0 0 0 - 0 0 0 0 - 0 0 0 0 - - -### ` setValues({..initializer_list})` - -Fills the tensor with explicit values specified in a std::initializer_list. -The type of the initializer list depends on the type and rank of the tensor. - -If the tensor has rank N, the initializer list must be nested N times. The -most deeply nested lists must contains P scalars of the Tensor type where P is -the size of the last dimension of the Tensor. - -For example, for a `TensorFixedSize` the initializer list must -contains 2 lists of 3 floats each. - -`setValues()` returns the tensor itself in case you want to chain another -call. - - Eigen::Tensor a(2, 3); - a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}}); - cout << "a" << endl << a << endl << endl; - => - a - 0 1 2 - 3 4 5 - -If a list is too short, the corresponding elements of the tensor will not be -changed. This is valid at each level of nesting. For example the following -code only sets the values of the first row of the tensor. - - Eigen::Tensor a(2, 3); - a.setConstant(1000); - a.setValues({{10, 20, 30}}); - cout << "a" << endl << a << endl << endl; - => - a - 10 20 30 - 1000 1000 1000 - -### ` setRandom()` - -Fills the tensor with random values. Returns the tensor itself in case you -want to chain another call. - - a.setRandom(); - cout << "Random: " << endl << a << endl << endl; - => - Random: - 0.680375 0.59688 -0.329554 0.10794 - -0.211234 0.823295 0.536459 -0.0452059 - 0.566198 -0.604897 -0.444451 0.257742 - -You can customize `setRandom()` by providing your own random number -generator as a template argument: - - a.setRandom(); - -Here, `MyRandomGenerator` must be a struct with the following member -functions, where Scalar and Index are the same as `::``Scalar` -and `::``Index`. 
- -See `struct UniformRandomGenerator` in TensorFunctors.h for an example. - - // Custom number generator for use with setRandom(). - struct MyRandomGenerator { - // Default and copy constructors. Both are needed - MyRandomGenerator() { } - MyRandomGenerator(const MyRandomGenerator& ) { } - - // Return a random value to be used. "element_location" is the - // location of the entry to set in the tensor, it can typically - // be ignored. - Scalar operator()(Eigen::DenseIndex element_location, - Eigen::DenseIndex /*unused*/ = 0) const { - return ; - } - - // Same as above but generates several numbers at a time. - typename internal::packet_traits::type packetOp( - Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { - return ; - } - }; - -You can also use one of the 2 random number generators that are part of the -tensor library: -* UniformRandomGenerator -* NormalRandomGenerator - - -## Data Access - -The Tensor, TensorFixedSize, and TensorRef classes provide the following -accessors to access the tensor coefficients: - - const Scalar& operator()(const array& indices) - const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) - Scalar& operator()(const array& indices) - Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) - -The number of indices must be equal to the rank of the tensor. Moreover, these -accessors are not available on tensor expressions. In order to access the -values of a tensor expression, the expression must either be evaluated or -wrapped in a TensorRef. - - -### `Scalar* data()` and `const Scalar* data() const` - -Returns a pointer to the storage for the tensor. The pointer is const if the -tensor was const. This allows direct access to the data. The layout of the -data depends on the tensor layout: RowMajor or ColMajor. - -This access is usually only needed for special cases, for example when mixing -Eigen Tensor code with other libraries. - -Scalar is the type of data stored in the tensor. - - Eigen::Tensor a(3, 4); - float* a_data = a.data(); - a_data[0] = 123.45f; - cout << "a(0, 0): " << a(0, 0); - => a(0, 0): 123.45 - - -## Tensor Operations - -All the methods documented below return non evaluated tensor `Operations`. -These can be chained: you can apply another Tensor Operation to the value -returned by the method. - -The chain of Operation is evaluated lazily, typically when it is assigned to a -tensor. See "Controlling when Expression are Evaluated" for more details about -their evaluation. - -### ` constant(const Scalar& val)` - -Returns a tensor of the same type and dimensions as the original tensor but -where all elements have the value `val`. - -This is useful, for example, when you want to add or subtract a constant from a -tensor, or multiply every element of a tensor by a scalar. - - Eigen::Tensor a(2, 3); - a.setConstant(1.0f); - Eigen::Tensor b = a + a.constant(2.0f); - Eigen::Tensor c = b * b.constant(0.2f); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - cout << "c" << endl << c << endl << endl; - => - a - 1 1 1 - 1 1 1 - - b - 3 3 3 - 3 3 3 - - c - 0.6 0.6 0.6 - 0.6 0.6 0.6 - -### ` random()` - -Returns a tensor of the same type and dimensions as the current tensor -but where all elements have random values. - -This is for example useful to add random values to an existing tensor. -The generation of random values can be customized in the same manner -as for `setRandom()`. 
- - Eigen::Tensor a(2, 3); - a.setConstant(1.0f); - Eigen::Tensor b = a + a.random(); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 1 1 - 1 1 1 - - b - 1.68038 1.5662 1.82329 - 0.788766 1.59688 0.395103 - - -## Unary Element Wise Operations - -All these operations take a single input tensor as argument and return a tensor -of the same type and dimensions as the tensor to which they are applied. The -requested operations are applied to each element independently. - -### ` operator-()` - -Returns a tensor of the same type and dimensions as the original tensor -containing the opposite values of the original tensor. - - Eigen::Tensor a(2, 3); - a.setConstant(1.0f); - Eigen::Tensor b = -a; - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 1 1 - 1 1 1 - - b - -1 -1 -1 - -1 -1 -1 - -### ` sqrt()` - -Returns a tensor of the same type and dimensions as the original tensor -containing the square roots of the original tensor. - -### ` rsqrt()` - -Returns a tensor of the same type and dimensions as the original tensor -containing the inverse square roots of the original tensor. - -### ` square()` - -Returns a tensor of the same type and dimensions as the original tensor -containing the squares of the original tensor values. - -### ` inverse()` - -Returns a tensor of the same type and dimensions as the original tensor -containing the inverse of the original tensor values. - -### ` exp()` - -Returns a tensor of the same type and dimensions as the original tensor -containing the exponential of the original tensor. - -### ` log()` - -Returns a tensor of the same type and dimensions as the original tensor -containing the natural logarithms of the original tensor. - -### ` abs()` - -Returns a tensor of the same type and dimensions as the original tensor -containing the absolute values of the original tensor. - -### ` pow(Scalar exponent)` - -Returns a tensor of the same type and dimensions as the original tensor -containing the coefficients of the original tensor to the power of the -exponent. - -The type of the exponent, Scalar, is always the same as the type of the -tensor coefficients. For example, only integer exponents can be used in -conjuntion with tensors of integer values. - -You can use cast() to lift this restriction. For example this computes -cubic roots of an int Tensor: - - Eigen::Tensor a(2, 3); - a.setValues({{0, 1, 8}, {27, 64, 125}}); - Eigen::Tensor b = a.cast().pow(1.0 / 3.0); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 0 1 8 - 27 64 125 - - b - 0 1 2 - 3 4 5 - -### ` operator * (Scalar scale)` - -Multiplies all the coefficients of the input tensor by the provided scale. - -### ` cwiseMax(Scalar threshold)` -TODO - -### ` cwiseMin(Scalar threshold)` -TODO - -### ` unaryExpr(const CustomUnaryOp& func)` -TODO - - -## Binary Element Wise Operations - -These operations take two input tensors as arguments. The 2 input tensors should -be of the same type and dimensions. The result is a tensor of the same -dimensions as the tensors to which they are applied, and unless otherwise -specified it is also of the same type. The requested operations are applied to -each pair of elements independently. - -### ` operator+(const OtherDerived& other)` - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise sums of the inputs. 
- -### ` operator-(const OtherDerived& other)` - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise differences of the inputs. - -### ` operator*(const OtherDerived& other)` - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise products of the inputs. - -### ` operator/(const OtherDerived& other)` - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise quotients of the inputs. - -This operator is not supported for integer types. - -### ` cwiseMax(const OtherDerived& other)` - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise maximums of the inputs. - -### ` cwiseMin(const OtherDerived& other)` - -Returns a tensor of the same type and dimensions as the input tensors -containing the coefficient wise mimimums of the inputs. - -### ` Logical operators` - -The following logical operators are supported as well: - -* operator&&(const OtherDerived& other) -* operator||(const OtherDerived& other) -* operator<(const OtherDerived& other) -* operator<=(const OtherDerived& other) -* operator>(const OtherDerived& other) -* operator>=(const OtherDerived& other) -* operator==(const OtherDerived& other) -* operator!=(const OtherDerived& other) - -They all return a tensor of boolean values. - - -## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) - -Selection is a coefficient-wise ternary operator that is the tensor equivalent -to the if-then-else operation. - - Tensor if = ...; - Tensor then = ...; - Tensor else = ...; - Tensor result = if.select(then, else); - -The 3 arguments must be of the same dimensions, which will also be the dimension -of the result. The 'if' tensor must be of type boolean, the 'then' and the -'else' tensor must be of the same type, which will also be the type of the -result. - -Each coefficient in the result is equal to the corresponding coefficient in the -'then' tensor if the corresponding value in the 'if' tensor is true. If not, the -resulting coefficient will come from the 'else' tensor. - - -## Contraction - -Tensor *contractions* are a generalization of the matrix product to the -multidimensional case. - - // Create 2 matrices using tensors of rank 2 - Eigen::Tensor a(2, 3); - a.setValues({{1, 2, 3}, {6, 5, 4}}); - Eigen::Tensor b(3, 2); - b.setValues({{1, 2}, {4, 5}, {5, 6}}); - - // Compute the traditional matrix product - Eigen::array, 1> product_dims = { Eigen::IndexPair(1, 0) }; - Eigen::Tensor AB = a.contract(b, product_dims); - - // Compute the product of the transpose of the matrices - Eigen::array, 1> transposed_product_dims = { Eigen::IndexPair(0, 1) }; - Eigen::Tensor AtBt = a.contract(b, transposed_product_dims); - - // Contraction to scalar value using a double contraction. - // First coordinate of both tensors are contracted as well as both second coordinates, i.e., this computes the sum of the squares of the elements. - Eigen::array, 2> double_contraction_product_dims = { Eigen::IndexPair(0, 0), Eigen::IndexPair(1, 1) }; - Eigen::Tensor AdoubleContractedA = a.contract(a, double_contraction_product_dims); - - // Extracting the scalar value of the tensor contraction for further usage - int value = AdoubleContractedA(0); - -## Reduction Operations - -A *Reduction* operation returns a tensor with fewer dimensions than the -original tensor. 
The values in the returned tensor are computed by applying a -*reduction operator* to slices of values from the original tensor. You specify -the dimensions along which the slices are made. - -The Eigen Tensor library provides a set of predefined reduction operators such -as `maximum()` and `sum()` and lets you define additional operators by -implementing a few methods from a reductor template. - -### Reduction Dimensions - -All reduction operations take a single parameter of type -`::``Dimensions` which can always be specified as an array of -ints. These are called the "reduction dimensions." The values are the indices -of the dimensions of the input tensor over which the reduction is done. The -parameter can have at most as many element as the rank of the input tensor; -each element must be less than the tensor rank, as it indicates one of the -dimensions to reduce. - -Each dimension of the input tensor should occur at most once in the reduction -dimensions as the implementation does not remove duplicates. - -The order of the values in the reduction dimensions does not affect the -results, but the code may execute faster if you list the dimensions in -increasing order. - -Example: Reduction along one dimension. - - // Create a tensor of 2 dimensions - Eigen::Tensor a(2, 3); - a.setValues({{1, 2, 3}, {6, 5, 4}}); - // Reduce it along the second dimension (1)... - Eigen::array dims({1 /* dimension to reduce */}); - // ...using the "maximum" operator. - // The result is a tensor with one dimension. The size of - // that dimension is the same as the first (non-reduced) dimension of a. - Eigen::Tensor b = a.maximum(dims); - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 2 3 - 6 5 4 - - b - 3 - 6 - -Example: Reduction along two dimensions. - - Eigen::Tensor a(2, 3, 4); - a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, - {7.0f, 6.0f, 5.0f, 4.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}, - {{12.0f, 13.0f, 14.0f, 15.0f}, - {19.0f, 18.0f, 17.0f, 16.0f}, - {20.0f, 21.0f, 22.0f, 23.0f}}}); - // The tensor a has 3 dimensions. We reduce along the - // first 2, resulting in a tensor with a single dimension - // of size 4 (the last dimension of a.) - // Note that we pass the array of reduction dimensions - // directly to the maximum() call. - Eigen::Tensor b = - a.maximum(Eigen::array({0, 1})); - cout << "b" << endl << b << endl << endl; - => - b - 20 - 21 - 22 - 23 - -#### Reduction along all dimensions - -As a special case, if you pass no parameter to a reduction operation the -original tensor is reduced along *all* its dimensions. The result is a -scalar, represented as a zero-dimension tensor. - - Eigen::Tensor a(2, 3, 4); - a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, - {7.0f, 6.0f, 5.0f, 4.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}, - {{12.0f, 13.0f, 14.0f, 15.0f}, - {19.0f, 18.0f, 17.0f, 16.0f}, - {20.0f, 21.0f, 22.0f, 23.0f}}}); - // Reduce along all dimensions using the sum() operator. - Eigen::Tensor b = a.sum(); - cout << "b" << endl << b << endl << endl; - => - b - 276 - - -### ` sum(const Dimensions& new_dims)` -### ` sum()` - -Reduce a tensor using the sum() operator. The resulting values -are the sum of the reduced values. - -### ` mean(const Dimensions& new_dims)` -### ` mean()` - -Reduce a tensor using the mean() operator. The resulting values -are the mean of the reduced values. - -### ` maximum(const Dimensions& new_dims)` -### ` maximum()` - -Reduce a tensor using the maximum() operator. The resulting values are the -largest of the reduced values. 
- -### ` minimum(const Dimensions& new_dims)` -### ` minimum()` - -Reduce a tensor using the minimum() operator. The resulting values -are the smallest of the reduced values. - -### ` prod(const Dimensions& new_dims)` -### ` prod()` - -Reduce a tensor using the prod() operator. The resulting values -are the product of the reduced values. - -### ` all(const Dimensions& new_dims)` -### ` all()` -Reduce a tensor using the all() operator. Casts tensor to bool and then checks -whether all elements are true. Runs through all elements rather than -short-circuiting, so may be significantly inefficient. - -### ` any(const Dimensions& new_dims)` -### ` any()` -Reduce a tensor using the any() operator. Casts tensor to bool and then checks -whether any element is true. Runs through all elements rather than -short-circuiting, so may be significantly inefficient. - - -### ` reduce(const Dimensions& new_dims, const Reducer& reducer)` - -Reduce a tensor using a user-defined reduction operator. See `SumReducer` -in TensorFunctors.h for information on how to implement a reduction operator. - - -## Scan Operations - -A *Scan* operation returns a tensor with the same dimensions as the original -tensor. The operation performs an inclusive scan along the specified -axis, which means it computes a running total along the axis for a given -reduction operation. -If the reduction operation corresponds to summation, then this computes the -prefix sum of the tensor along the given axis. - -Example: -dd a comment to this line - - // Create a tensor of 2 dimensions - Eigen::Tensor a(2, 3); - a.setValues({{1, 2, 3}, {4, 5, 6}}); - // Scan it along the second dimension (1) using summation - Eigen::Tensor b = a.cumsum(1); - // The result is a tensor with the same size as the input - cout << "a" << endl << a << endl << endl; - cout << "b" << endl << b << endl << endl; - => - a - 1 2 3 - 4 5 6 - - b - 1 3 6 - 4 9 15 - -### ` cumsum(const Index& axis)` - -Perform a scan by summing consecutive entries. - -### ` cumprod(const Index& axis)` - -Perform a scan by multiplying consecutive entries. - - -## Convolutions - -### ` convolve(const Kernel& kernel, const Dimensions& dims)` - -Returns a tensor that is the output of the convolution of the input tensor with the kernel, -along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor -which were part of the convolution will be reduced by the formula: -output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size). -The dimension sizes for dimensions that were not part of the convolution will remain the same. -Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the -convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is -for the last dimension). - - // Compute convolution along the second and third dimension. - Tensor input(3, 3, 7, 11); - Tensor kernel(2, 2); - Tensor output(3, 2, 6, 11); - input.setRandom(); - kernel.setRandom(); - - Eigen::array dims({1, 2}); // Specify second and third dimension for convolution. 
- output = input.convolve(kernel, dims); - - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 2; ++j) { - for (int k = 0; k < 6; ++k) { - for (int l = 0; l < 11; ++l) { - const float result = output(i,j,k,l); - const float expected = input(i,j+0,k+0,l) * kernel(0,0) + - input(i,j+1,k+0,l) * kernel(1,0) + - input(i,j+0,k+1,l) * kernel(0,1) + - input(i,j+1,k+1,l) * kernel(1,1); - VERIFY_IS_APPROX(result, expected); - } - } - } - } - - -## Geometrical Operations - -These operations return a Tensor with different dimensions than the original -Tensor. They can be used to access slices of tensors, see them with different -dimensions, or pad tensors with additional data. - -### ` reshape(const Dimensions& new_dims)` - -Returns a view of the input tensor that has been reshaped to the specified -new dimensions. The argument new_dims is an array of Index values. The -rank of the resulting tensor is equal to the number of elements in new_dims. - -The product of all the sizes in the new dimension array must be equal to -the number of elements in the input tensor. - - // Increase the rank of the input tensor by introducing a new dimension - // of size 1. - Tensor input(7, 11); - array three_dims{{7, 11, 1}}; - Tensor result = input.reshape(three_dims); - - // Decrease the rank of the input tensor by merging 2 dimensions; - array one_dim{{7 * 11}}; - Tensor result = input.reshape(one_dim); - -This operation does not move any data in the input tensor, so the resulting -contents of a reshaped Tensor depend on the data layout of the original Tensor. - -For example this is what happens when you `reshape()` a 2D ColMajor tensor -to one dimension: - - Eigen::Tensor a(2, 3); - a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); - Eigen::array one_dim({3 * 2}); - Eigen::Tensor b = a.reshape(one_dim); - cout << "b" << endl << b << endl; - => - b - 0 - 300 - 100 - 400 - 200 - 500 - -This is what happens when the 2D Tensor is RowMajor: - - Eigen::Tensor a(2, 3); - a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); - Eigen::array one_dim({3 * 2}); - Eigen::Tensor b = a.reshape(one_dim); - cout << "b" << endl << b << endl; - => - b - 0 - 100 - 200 - 300 - 400 - 500 - -The reshape operation is a lvalue. In other words, it can be used on the left -side of the assignment operator. - -The previous example can be rewritten as follow: - - Eigen::Tensor a(2, 3); - a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); - Eigen::array two_dim({2, 3}); - Eigen::Tensor b(6); - b.reshape(two_dim) = a; - cout << "b" << endl << b << endl; - => - b - 0 - 300 - 100 - 400 - 200 - 500 - -Note that "b" itself was not reshaped but that instead the assignment is done to -the reshape view of b. - - -### ` shuffle(const Shuffle& shuffle)` - -Returns a copy of the input tensor whose dimensions have been -reordered according to the specified permutation. The argument shuffle -is an array of Index values. Its size is the rank of the input -tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th -dimension of the output tensor equals to the size of the shuffle[i]-th -dimension of the input tensor. For example: - - // Shuffle all dimensions to the left by 1. - Tensor input(20, 30, 50); - // ... set some values in input. 
- Tensor output = input.shuffle({1, 2, 0}) - - eigen_assert(output.dimension(0) == 30); - eigen_assert(output.dimension(1) == 50); - eigen_assert(output.dimension(2) == 20); - -Indices into the output tensor are shuffled accordingly to formulate -indices into the input tensor. For example, one can assert in the above -code snippet that: - - eigen_assert(output(3, 7, 11) == input(11, 3, 7)); - -In general, one can assert that - - eigen_assert(output(..., indices[shuffle[i]], ...) == - input(..., indices[i], ...)) - -The shuffle operation results in a lvalue, which means that it can be assigned -to. In other words, it can be used on the left side of the assignment operator. - -Let's rewrite the previous example to take advantage of this feature: - - // Shuffle all dimensions to the left by 1. - Tensor input(20, 30, 50); - // ... set some values in input. - Tensor output(30, 50, 20); - output.shuffle({2, 0, 1}) = input; - - -### ` stride(const Strides& strides)` - -Returns a view of the input tensor that strides (skips stride-1 -elements) along each of the dimensions. The argument strides is an -array of Index values. The dimensions of the resulting tensor are -ceil(input_dimensions[i] / strides[i]). - -For example this is what happens when you `stride()` a 2D tensor: - - Eigen::Tensor a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}}); - Eigen::array strides({3, 2}); - Eigen::Tensor b = a.stride(strides); - cout << "b" << endl << b << endl; - => - b - 0 200 - 900 1100 - -It is possible to assign a tensor to a stride: - Tensor input(20, 30, 50); - // ... set some values in input. - Tensor output(40, 90, 200); - output.stride({2, 3, 4}) = input; - - -### ` slice(const StartIndices& offsets, const Sizes& extents)` - -Returns a sub-tensor of the given tensor. For each dimension i, the slice is -made of the coefficients stored between offset[i] and offset[i] + extents[i] in -the input tensor. - - Eigen::Tensor a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, - {600, 700, 800}, {900, 1000, 1100}}); - Eigen::array offsets = {1, 0}; - Eigen::array extents = {2, 2}; - Eigen::Tensor slice = a.slice(offsets, extents); - cout << "a" << endl << a << endl; - => - a - 0 100 200 - 300 400 500 - 600 700 800 - 900 1000 1100 - cout << "slice" << endl << slice << endl; - => - slice - 300 400 - 600 700 - - -### ` chip(const Index offset, const Index dim)` - -A chip is a special kind of slice. It is the subtensor at the given offset in -the dimension dim. The returned tensor has one fewer dimension than the input -tensor: the dimension dim is removed. - -For example, a matrix chip would be either a row or a column of the input -matrix. - - Eigen::Tensor a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, - {600, 700, 800}, {900, 1000, 1100}}); - Eigen::Tensor row_3 = a.chip(2, 0); - Eigen::Tensor col_2 = a.chip(1, 1); - cout << "a" << endl << a << endl; - => - a - 0 100 200 - 300 400 500 - 600 700 800 - 900 1000 1100 - cout << "row_3" << endl << row_3 << endl; - => - row_3 - 600 700 800 - cout << "col_2" << endl << col_2 << endl; - => - col_2 - 100 400 700 1000 - -It is possible to assign values to a tensor chip since the chip operation is a -lvalue. 
For example: - - Eigen::Tensor a(3); - a.setValues({{100, 200, 300}}); - Eigen::Tensor b(2, 3); - b.setZero(); - b.chip(0, 0) = a; - cout << "a" << endl << a << endl; - => - a - 100 - 200 - 300 - cout << "b" << endl << b << endl; - => - b - 100 200 300 - 0 0 0 - - -### ` reverse(const ReverseDimensions& reverse)` - -Returns a view of the input tensor that reverses the order of the coefficients -along a subset of the dimensions. The argument reverse is an array of boolean -values that indicates whether or not the order of the coefficients should be -reversed along each of the dimensions. This operation preserves the dimensions -of the input tensor. - -For example this is what happens when you `reverse()` the first dimension -of a 2D tensor: - - Eigen::Tensor a(4, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}, - {600, 700, 800}, {900, 1000, 1100}}); - Eigen::array reverse({true, false}); - Eigen::Tensor b = a.reverse(reverse); - cout << "a" << endl << a << endl << "b" << endl << b << endl; - => - a - 0 100 200 - 300 400 500 - 600 700 800 - 900 1000 1100 - b - 900 1000 1100 - 600 700 800 - 300 400 500 - 0 100 200 - - -### ` broadcast(const Broadcast& broadcast)` - -Returns a view of the input tensor in which the input is replicated one to many -times. -The broadcast argument specifies how many copies of the input tensor need to be -made in each of the dimensions. - - Eigen::Tensor a(2, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}}); - Eigen::array bcast({3, 2}); - Eigen::Tensor b = a.broadcast(bcast); - cout << "a" << endl << a << endl << "b" << endl << b << endl; - => - a - 0 100 200 - 300 400 500 - b - 0 100 200 0 100 200 - 300 400 500 300 400 500 - 0 100 200 0 100 200 - 300 400 500 300 400 500 - 0 100 200 0 100 200 - 300 400 500 300 400 500 - -### ` concatenate(const OtherDerived& other, Axis axis)` - -TODO - -### ` pad(const PaddingDimensions& padding)` - -Returns a view of the input tensor in which the input is padded with zeros. - - Eigen::Tensor a(2, 3); - a.setValues({{0, 100, 200}, {300, 400, 500}}); - Eigen::array, 2> paddings; - paddings[0] = make_pair(0, 1); - paddings[1] = make_pair(2, 3); - Eigen::Tensor b = a.pad(paddings); - cout << "a" << endl << a << endl << "b" << endl << b << endl; - => - a - 0 100 200 - 300 400 500 - b - 0 0 0 0 - 0 0 0 0 - 0 100 200 0 - 300 400 500 0 - 0 0 0 0 - 0 0 0 0 - 0 0 0 0 - - -### ` extract_patches(const PatchDims& patch_dims)` - -Returns a tensor of coefficient patches extracted from the input tensor, where -each patch is of dimension specified by 'patch_dims'. The returned tensor has -one greater dimension than the input tensor, which is used to index each patch. -The patch index in the output tensor depends on the data layout of the input -tensor: the patch index is the last dimension ColMajor layout, and the first -dimension in RowMajor layout. 
-
-### `<Operation> pad(const PaddingDimensions& padding)`
-
-Returns a view of the input tensor in which the input is padded with zeros.
-
-    Eigen::Tensor<int, 2> a(2, 3);
-    a.setValues({{0, 100, 200}, {300, 400, 500}});
-    Eigen::array<pair<int, int>, 2> paddings;
-    paddings[0] = make_pair(2, 3);
-    paddings[1] = make_pair(0, 1);
-    Eigen::Tensor<int, 2> b = a.pad(paddings);
-    cout << "a" << endl << a << endl << "b" << endl << b << endl;
-    =>
-    a
-       0   100   200
-     300   400   500
-    b
-       0     0     0    0
-       0     0     0    0
-       0   100   200    0
-     300   400   500    0
-       0     0     0    0
-       0     0     0    0
-       0     0     0    0
-
-
-### `<Operation> extract_patches(const PatchDims& patch_dims)`
-
-Returns a tensor of coefficient patches extracted from the input tensor, where
-each patch is of dimension specified by 'patch_dims'. The returned tensor has
-one greater dimension than the input tensor, which is used to index each patch.
-The patch index in the output tensor depends on the data layout of the input
-tensor: the patch index is the last dimension in ColMajor layout, and the first
-dimension in RowMajor layout.
-
-For example, given the following input tensor:
-
-    Eigen::Tensor<float, 2> tensor(3,4);
-    tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
-                      {4.0f, 5.0f, 6.0f, 7.0f},
-                      {8.0f, 9.0f, 10.0f, 11.0f}});
-
-    cout << "tensor: " << endl << tensor << endl;
-=>
-tensor:
- 0   1   2   3
- 4   5   6   7
- 8   9  10  11
-
-Six 2x2 patches can be extracted and indexed using the following code:
-
-    Eigen::Tensor<float, 3> patch;
-    Eigen::array<ptrdiff_t, 2> patch_dims;
-    patch_dims[0] = 2;
-    patch_dims[1] = 2;
-    patch = tensor.extract_patches(patch_dims);
-    for (int k = 0; k < 6; ++k) {
-      cout << "patch index: " << k << endl;
-      for (int i = 0; i < 2; ++i) {
-        for (int j = 0; j < 2; ++j) {
-          if (DataLayout == ColMajor) {
-            cout << patch(i, j, k) << " ";
-          } else {
-            cout << patch(k, i, j) << " ";
-          }
-        }
-        cout << endl;
-      }
-    }
-
-This code results in the following output when the data layout is ColMajor:
-
-patch index: 0
-0 1
-4 5
-patch index: 1
-4 5
-8 9
-patch index: 2
-1 2
-5 6
-patch index: 3
-5 6
-9 10
-patch index: 4
-2 3
-6 7
-patch index: 5
-6 7
-10 11
-
-This code results in the following output when the data layout is RowMajor:
-(NOTE: the set of patches is the same as in ColMajor, but they are indexed differently).
-
-patch index: 0
-0 1
-4 5
-patch index: 1
-1 2
-5 6
-patch index: 2
-2 3
-6 7
-patch index: 3
-4 5
-8 9
-patch index: 4
-5 6
-9 10
-patch index: 5
-6 7
-10 11
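The patch count above follows from the input dimensions: a 2x2 window fits at (3 - 2 + 1) * (4 - 2 + 1) = 6 positions. Continuing the ColMajor case of the example, a small sketch (not part of the original text) checks the resulting dimensions:

    // ColMajor layout: the two patch dimensions come first,
    // the patch index is the last dimension.
    eigen_assert(patch.dimension(0) == 2);
    eigen_assert(patch.dimension(1) == 2);
    eigen_assert(patch.dimension(2) == 6);  // (3-2+1) * (4-2+1) patch positions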
-
-### `<Operation> extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)`
-
-Returns a tensor of coefficient image patches extracted from the input tensor,
-which is expected to have dimensions ordered as follows (depending on the data
-layout of the input tensor, and the number of additional dimensions 'N'):
-
-*) ColMajor
-1st dimension: channels (of size d)
-2nd dimension: rows (of size r)
-3rd dimension: columns (of size c)
-4th-Nth dimension: time (for video) or batch (for bulk processing).
-
-*) RowMajor (reverse order of ColMajor)
-1st-Nth dimension: time (for video) or batch (for bulk processing).
-N+1'th dimension: columns (of size c)
-N+2'th dimension: rows (of size r)
-N+3'th dimension: channels (of size d)
-
-The returned tensor has one greater dimension than the input tensor, which is
-used to index each patch. The patch index in the output tensor depends on the
-data layout of the input tensor: the patch index is the 4'th dimension in
-ColMajor layout, and the 4'th from the last dimension in RowMajor layout.
-
-For example, given an input tensor with the following dimension sizes:
- *) depth: 2
- *) rows: 3
- *) columns: 5
- *) batch: 7
-
-    Tensor<float, 4> tensor(2,3,5,7);
-    Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
-
-2x2 image patches can be extracted and indexed using the following code:
-
-*) 2D patch: ColMajor (patch indexed by second-to-last dimension)
-
-    Tensor<float, 5> twod_patch;
-    twod_patch = tensor.extract_image_patches(2, 2);
-    // twod_patch.dimension(0) == 2
-    // twod_patch.dimension(1) == 2
-    // twod_patch.dimension(2) == 2
-    // twod_patch.dimension(3) == 3*5
-    // twod_patch.dimension(4) == 7
-
-*) 2D patch: RowMajor (patch indexed by the second dimension)
-
-    Tensor<float, 5, RowMajor> twod_patch_row_major;
-    twod_patch_row_major = tensor_row_major.extract_image_patches(2, 2);
-    // twod_patch_row_major.dimension(0) == 7
-    // twod_patch_row_major.dimension(1) == 3*5
-    // twod_patch_row_major.dimension(2) == 2
-    // twod_patch_row_major.dimension(3) == 2
-    // twod_patch_row_major.dimension(4) == 2
-
-## Special Operations
-
-### `<Operation> cast<T>()`
-
-Returns a tensor of type T with the same dimensions as the original tensor.
-The returned tensor contains the values of the original tensor converted to
-type T.
-
-    Eigen::Tensor<float, 2> a(2, 3);
-    Eigen::Tensor<int, 2> b = a.cast<int>();
-
-This can be useful for example if you need to do element-wise division of
-Tensors of integers. This is not currently supported by the Tensor library
-but you can easily cast the tensors to floats to do the division:
-
-    Eigen::Tensor<int, 2> a(2, 3);
-    a.setValues({{0, 1, 2}, {3, 4, 5}});
-    Eigen::Tensor<int, 2> b =
-        (a.cast<float>() / a.constant(2).cast<float>()).cast<int>();
-    cout << "a" << endl << a << endl << endl;
-    cout << "b" << endl << b << endl << endl;
-    =>
-    a
-    0 1 2
-    3 4 5
-
-    b
-    0 0 1
-    1 2 2
-
-
-### `<Operation> eval()`
-
-TODO
-
-
-## Representation of scalar values
-
-Scalar values are often represented by tensors of size 1 and rank 0. For
-example, Tensor<float, N>::maximum() currently returns a Tensor<float, 0>.
-Similarly, the inner product of 2 1d tensors (through contractions) returns
-a 0d tensor.
-
-## Limitations
-
-* The number of tensor dimensions is currently limited to 250 when using a
-  compiler that supports cxx11. It is limited to only 5 for older compilers.
-* The IndexList class requires a cxx11 compliant compiler. You can use an
-  array of indices instead if you don't have access to a modern compiler.
-* On GPUs only floating point values are properly tested and optimized for.
-* Complex and integer values are known to be broken on GPUs. If you try to use
-  them you'll most likely end up triggering a static assertion failure such as
-  EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-
-
diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
deleted file mode 100644
index 00295a2..0000000
--- a/eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ /dev/null
@@ -1,527 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner
-// Copyright (C) 2013 Christian Seiler
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_H
-#define EIGEN_CXX11_TENSOR_TENSOR_H
-
-namespace Eigen {
-
-/** \class Tensor
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief The tensor class.
- * - * The %Tensor class is the work-horse for all \em dense tensors within Eigen. - * - * The %Tensor class encompasses only dynamic-size objects so far. - * - * The first two template parameters are required: - * \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex`. - * User defined scalar types are supported as well (see \ref user_defined_scalars "here"). - * \tparam NumIndices_ Number of indices (i.e. rank of the tensor) - * - * The remaining template parameters are optional -- in most cases you don't have to worry about them. - * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either - * \b #AutoAlign or \b #DontAlign. - * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required - * for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization. - * Support for such operations (i.e. adding two tensors etc.) is planned. - * - * You can access elements of tensors using normal subscripting: - * - * \code - * Eigen::Tensor t(10, 10, 10, 10); - * t(0, 1, 2, 3) = 42.0; - * \endcode - * - * This class can be extended with the help of the plugin mechanism described on the page - * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN. - * - * Some notes: - * - *
- *
Relation to other parts of Eigen:
- *
The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that - * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code - * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor - * class does not provide any of these features and is only available as a stand-alone class that just allows for - * coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to - * change dramatically.
- *
- * - * \ref TopicStorageOrders - */ - -template -class Tensor : public TensorBase > -{ - public: - typedef Tensor Self; - typedef TensorBase > Base; - typedef typename Eigen::internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - typedef Scalar_ Scalar; - typedef typename NumTraits::Real RealScalar; - typedef typename Base::CoeffReturnType CoeffReturnType; - - enum { - IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign), - Layout = Options_ & RowMajor ? RowMajor : ColMajor, - CoordAccess = true, - RawAccess = true - }; - - static const int Options = Options_; - static const int NumIndices = NumIndices_; - typedef DSizes Dimensions; - - protected: - TensorStorage m_storage; - -#ifdef EIGEN_HAS_SFINAE - template - struct isOfNormalIndex{ - static const bool is_array = internal::is_base_of, CustomIndices>::value; - static const bool is_int = NumTraits::IsInteger; - static const bool value = is_array | is_int; - }; -#endif - - public: - // Metadata - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } - - // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - // work, because that uses base().coeffRef() - and we don't yet - // implement a similar class hierarchy - inline Self& base() { return *this; } - inline const Self& base() const { return *this; } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeff(array{{firstIndex, secondIndex, otherIndices...}}); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const - { - return coeff(internal::customIndices2Array(indices)); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(array{{firstIndex, secondIndex, otherIndices...}}); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices) - { - return coeffRef(internal::customIndices2Array(indices)); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return this->operator()(array{{firstIndex, secondIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const - { - return coeff(array(i0, i1)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const - { - return coeff(array(i0, i1, i2)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const - { - return coeff(array(i0, i1, i2, i3)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const - { - return coeff(array(i0, i1, i2, i3, i4)); - } -#endif - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const - { - return coeff(internal::customIndices2Array(indices)); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const - { - return coeff(indices); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return coeff(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const - { - // The bracket operator is only for vectors, use the parenthesis operator instead. - EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(index); - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return operator()(array{{firstIndex, secondIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) - { - return coeffRef(array(i0, i1)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) - { - return coeffRef(array(i0, i1, i2)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) - { - return coeffRef(array(i0, i1, i2, i3)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) - { - return coeffRef(array(i0, i1, i2, i3, i4)); - } -#endif - - // normal indices - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) - { - return coeffRef(indices); - } - - // custom indices -#ifdef EIGEN_HAS_SFINAE - template::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices) - { - return coeffRef(internal::customIndices2Array(indices)); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) - { - eigen_assert(index >= 0 && index < size()); - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeffRef(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) - { - // The bracket operator is only for vectors, use the parenthesis operator instead - EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor() - : m_storage() - { - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const Self& other) - : m_storage(other.m_storage) - { - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions) - : m_storage(firstDimension, otherDimensions...) - { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#else - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1) - : m_storage(dim1, array(dim1)) - { - EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2) - : m_storage(dim1*dim2, array(dim1, dim2)) - { - EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3) - : m_storage(dim1*dim2*dim3, array(dim1, dim2, dim3)) - { - EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4) - : m_storage(dim1*dim2*dim3*dim4, array(dim1, dim2, dim3, dim4)) - { - EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) - : m_storage(dim1*dim2*dim3*dim4*dim5, array(dim1, dim2, dim3, dim4, dim5)) - { - EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#endif - - /** Normal Dimension */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array& dimensions) - : m_storage(internal::array_prod(dimensions), dimensions) - { - EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const TensorBase& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other.derived()); - resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor::run(assign, DefaultDevice()); - } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(const TensorBase& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other.derived()); - resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor::run(assign, DefaultDevice()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC - void resize(Index firstDimension, IndexTypes... otherDimensions) - { - // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - resize(array{{firstDimension, otherDimensions...}}); - } -#endif - - /** Normal Dimension */ - EIGEN_DEVICE_FUNC void resize(const array& dimensions) - { - int i; - Index size = Index(1); - for (i = 0; i < NumIndices; i++) { - internal::check_rows_cols_for_overflow::run(size, dimensions[i]); - size *= dimensions[i]; - } - #ifdef EIGEN_INITIALIZE_COEFFS - bool size_changed = size != this->size(); - m_storage.resize(size, dimensions); - if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - #else - m_storage.resize(size, dimensions); - #endif - } - - // Why this overload, DSizes is derived from array ??? // - EIGEN_DEVICE_FUNC void resize(const DSizes& dimensions) { - array dims; - for (int i = 0; i < NumIndices; ++i) { - dims[i] = dimensions[i]; - } - resize(dims); - } - - EIGEN_DEVICE_FUNC - void resize() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - // Nothing to do: rank 0 tensors have fixed size - } - - /** Custom Dimension */ -#ifdef EIGEN_HAS_SFINAE - template::value) ) - > - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions) - { - resize(internal::customIndices2Array(dimensions)); - } -#endif - -#ifndef EIGEN_EMULATE_CXX11_META_H - template - EIGEN_DEVICE_FUNC - void resize(const Sizes& dimensions) { - array dims; - for (int i = 0; i < NumIndices; ++i) { - dims[i] = static_cast(dimensions[i]); - } - resize(dims); - } -#else - template - EIGEN_DEVICE_FUNC - void resize(const Sizes& dimensions) { - array dims; - for (int i = 0; i < NumIndices; ++i) { - dims[i] = static_cast(dimensions[i]); - } - resize(dims); - } -#endif - - protected: - - bool checkIndexRange(const array& indices) const - { - using internal::array_apply_and_reduce; - using internal::array_zip_and_reduce; - using internal::greater_equal_zero_op; - using internal::logical_and_op; - using internal::lesser_op; - - return - // check whether the indices are all >= 0 - array_apply_and_reduce(indices) && - // check whether the indices fit in the dimensions - array_zip_and_reduce(indices, m_storage.dimensions()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const - { - if (Options&RowMajor) { - return m_storage.dimensions().IndexOfRowMajor(indices); - } else { - return m_storage.dimensions().IndexOfColMajor(indices); - } - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h deleted file mode 100644 index d06f40c..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ /dev/null @@ -1,299 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Eugene Brevdo -// Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H -#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H - -namespace Eigen { -namespace internal { - -/** \class TensorIndexTuple - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor + Index Tuple class. 
- * - * - */ -template -struct traits > : public traits -{ - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef Tuple Scalar; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorIndexTupleOp& type; -}; - -template -struct nested, 1, - typename eval >::type> -{ - typedef TensorIndexTupleOp type; -}; - -} // end namespace internal - -template -class TensorIndexTupleOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - typedef Tuple CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr) - : m_xpr(expr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorIndexTupleOp XprType; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - - typedef typename TensorEvaluator::Dimensions Dimensions; - static const int NumDims = internal::array_size::value; - - enum { - IsAligned = /*TensorEvaluator::IsAligned*/ false, - PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return CoeffReturnType(index, m_impl.coeff(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - TensorEvaluator m_impl; -}; - -namespace internal { - -/** \class TensorTupleIndex - * \ingroup CXX11_Tensor_Module - * - * \brief Converts to Tensor > and reduces to Tensor. 
- * - */ -template -struct traits > : public traits -{ - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef Index Scalar; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions - array_size::value; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorTupleReducerOp& type; -}; - -template -struct nested, 1, - typename eval >::type> -{ - typedef TensorTupleReducerOp type; -}; - -} // end namespace internal - -template -class TensorTupleReducerOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - typedef Index CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr, - const ReduceOp& reduce_op, - const int return_dim, - const Dims& reduce_dims) - : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - const ReduceOp& reduce_op() const { return m_reduce_op; } - - EIGEN_DEVICE_FUNC - const Dims& reduce_dims() const { return m_reduce_dims; } - - EIGEN_DEVICE_FUNC - int return_dim() const { return m_return_dim; } - - protected: - typename XprType::Nested m_xpr; - const ReduceOp m_reduce_op; - const int m_return_dim; - const Dims m_reduce_dims; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorTupleReducerOp XprType; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename TensorIndexTupleOp::CoeffReturnType TupleType; - typedef typename TensorEvaluator >, Device>::Dimensions Dimensions; - typedef typename TensorEvaluator , Device>::Dimensions InputDimensions; - static const int NumDims = internal::array_size::value; - typedef array StrideDims; - - enum { - IsAligned = /*TensorEvaluator::IsAligned*/ false, - PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccess = false, - Layout = TensorEvaluator >, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_orig_impl(op.expression(), device), - m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()) { - - gen_strides(m_orig_impl.dimensions(), m_strides); - if (Layout == static_cast(ColMajor)) { - const Index total_size = internal::array_prod(m_orig_impl.dimensions()); - m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size; - } else { - const Index total_size = internal::array_prod(m_orig_impl.dimensions()); - m_stride_mod = (m_return_dim > 0) ? 
m_strides[m_return_dim - 1] : total_size; - } - m_stride_div = m_strides[m_return_dim]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - const TupleType v = m_impl.coeff(index); - return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double compute_cost = 1.0 + - (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost() + TensorOpCost::DivCost())); - return m_orig_impl.costPerCoeff(vectorized) + - m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); - } - - private: - EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) { - if (m_return_dim < 0) { - return; // Won't be using the strides. - } - eigen_assert(m_return_dim < NumDims && - "Asking to convert index to a dimension outside of the rank"); - - // Calculate m_stride_div and m_stride_mod, which are used to - // calculate the value of an index w.r.t. the m_return_dim. - if (Layout == static_cast(ColMajor)) { - strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - strides[i] = strides[i-1] * dims[i-1]; - } - } else { - strides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - strides[i] = strides[i+1] * dims[i+1]; - } - } - } - - protected: - TensorEvaluator, Device> m_orig_impl; - TensorEvaluator >, Device> m_impl; - const int m_return_dim; - StrideDims m_strides; - Index m_stride_mod; - Index m_stride_div; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h deleted file mode 100644 index 166be20..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ /dev/null @@ -1,181 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H -#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H - -namespace Eigen { - -/** \class TensorAssign - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor assignment class. - * - * This class is represents the assignment of the values resulting from the evaluation of - * the rhs expression to the memory locations denoted by the lhs expression. 
- */ -namespace internal { -template -struct traits > -{ - typedef typename LhsXprType::Scalar Scalar; - typedef typename traits::StorageKind StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - static const std::size_t NumDimensions = internal::traits::NumDimensions; - static const int Layout = internal::traits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorAssignOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorAssignOp type; -}; - -} // end namespace internal - - - -template -class TensorAssignOp : public TensorBase > -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename LhsXprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - typename internal::remove_all::type& - lhsExpression() const { return *((typename internal::remove_all::type*)&m_lhs_xpr); } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename internal::remove_all::type& m_lhs_xpr; - const typename internal::remove_all::type& m_rhs_xpr; -}; - - -template -struct TensorEvaluator, Device> -{ - typedef TensorAssignOp XprType; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename TensorEvaluator::Dimensions Dimensions; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - m_leftImpl(op.lhsExpression(), device), - m_rightImpl(op.rhsExpression(), device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - } - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // The dimensions of the lhs and the rhs tensors should be equal to prevent - // overflows and ensure the result is fully initialized. - // TODO: use left impl instead if right impl dimensions are known at compile time. - return m_rightImpl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); - m_leftImpl.evalSubExprsIfNeeded(NULL); - // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non - // null value), attempt to evaluate the rhs expression in place. 
Returns true iff in place - // evaluation isn't supported and the caller still needs to manually assign the values generated - // by the rhs to the lhs. - return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { - m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; - m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); - } - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_leftImpl.coeff(index); - } - template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const - { - return m_leftImpl.template packet(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // We assume that evalPacket or evalScalar is called to perform the - // assignment and account for the cost of the write here, but reduce left - // cost by one load because we are using m_leftImpl.coeffRef. - TensorOpCost left = m_leftImpl.costPerCoeff(vectorized); - return m_rightImpl.costPerCoeff(vectorized) + - TensorOpCost( - numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), - left.bytes_stored(), left.compute_cycles()) + - TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); - } - - /// required by sycl in order to extract the accessor - const TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } - - private: - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; -}; - -} - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h deleted file mode 100644 index f573608..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ /dev/null @@ -1,1012 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H -#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H - -// clang-format off - -namespace Eigen { - -/** \class TensorBase - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor base class. - * - * This class is the common parent of the Tensor and TensorMap class, thus - * making it possible to use either class interchangably in expressions. 
- */ -#ifndef EIGEN_PARSED_BY_DOXYGEN -// FIXME Doxygen does not like the inheritance with different template parameters -// Since there is no doxygen documentation inside, we disable it for now -template -class TensorBase -{ - public: - typedef internal::traits DerivedTraits; - typedef typename DerivedTraits::Scalar Scalar; - typedef typename DerivedTraits::Index Index; - typedef typename internal::remove_const::type CoeffReturnType; - static const int NumDimensions = DerivedTraits::NumDimensions; - - // Generic nullary operation support. - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp - nullaryExpr(const CustomNullaryOp& func) const { - return TensorCwiseNullaryOp(derived(), func); - } - - // Coefficient-wise nullary operators - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> - constant(const Scalar& value) const { - return nullaryExpr(internal::scalar_constant_op(value)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> - random() const { - return nullaryExpr(internal::UniformRandomGenerator()); - } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseNullaryOp - random(const RandomGenerator& gen = RandomGenerator()) const { - return nullaryExpr(gen); - } - - // Tensor generation - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorGeneratorOp - generate(const Generator& generator) const { - return TensorGeneratorOp(derived(), generator); - } - - // Generic unary operation support. - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp - unaryExpr(const CustomUnaryOp& func) const { - return TensorCwiseUnaryOp(derived(), func); - } - - // Coefficient-wise unary operators - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - operator-() const { - return unaryExpr(internal::scalar_opposite_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - sqrt() const { - return unaryExpr(internal::scalar_sqrt_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - sign() const { - return unaryExpr(internal::scalar_sign_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - rsqrt() const { - return unaryExpr(internal::scalar_rsqrt_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - square() const { - return unaryExpr(internal::scalar_square_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - cube() const { - return unaryExpr(internal::scalar_cube_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - inverse() const { - return unaryExpr(internal::scalar_inverse_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - tanh() const { - return unaryExpr(internal::scalar_tanh_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - lgamma() const { - return unaryExpr(internal::scalar_lgamma_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - digamma() const { - return unaryExpr(internal::scalar_digamma_op()); - } - - // igamma(a = this, x = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - igamma(const OtherDerived& other) const { - return binaryExpr(other.derived(), 
internal::scalar_igamma_op()); - } - - // igammac(a = this, x = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - igammac(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_igammac_op()); - } - - // zeta(x = this, q = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - zeta(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_zeta_op()); - } - - // polygamma(n = this, x = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - polygamma(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_polygamma_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - erf() const { - return unaryExpr(internal::scalar_erf_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - erfc() const { - return unaryExpr(internal::scalar_erfc_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - sigmoid() const { - return unaryExpr(internal::scalar_sigmoid_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - exp() const { - return unaryExpr(internal::scalar_exp_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - log() const { - return unaryExpr(internal::scalar_log_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - log1p() const { - return unaryExpr(internal::scalar_log1p_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - abs() const { - return unaryExpr(internal::scalar_abs_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - conjugate() const { - return unaryExpr(internal::scalar_conjugate_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> - pow(Scalar exponent) const { - return unaryExpr(internal::bind2nd_op >(exponent)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - real() const { - return unaryExpr(internal::scalar_real_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - imag() const { - return unaryExpr(internal::scalar_imag_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> - operator+ (Scalar rhs) const { - return unaryExpr(internal::bind2nd_op >(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp >, const Derived> - operator+ (Scalar lhs, const Derived& rhs) { - return rhs.unaryExpr(internal::bind1st_op >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> - operator- (Scalar rhs) const { - EIGEN_STATIC_ASSERT((NumTraits::IsSigned || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return unaryExpr(internal::bind2nd_op >(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp >, const Derived> - operator- (Scalar lhs, const Derived& rhs) { - return rhs.unaryExpr(internal::bind1st_op >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> - operator* (Scalar rhs) const { - return 
unaryExpr(internal::bind2nd_op >(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp >, const Derived> - operator* (Scalar lhs, const Derived& rhs) { - return rhs.unaryExpr(internal::bind1st_op >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> - operator/ (Scalar rhs) const { - return unaryExpr(internal::bind2nd_op >(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE friend - const TensorCwiseUnaryOp >, const Derived> - operator/ (Scalar lhs, const Derived& rhs) { - return rhs.unaryExpr(internal::bind1st_op >(lhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - operator% (Scalar rhs) const { - EIGEN_STATIC_ASSERT(NumTraits::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); - return unaryExpr(internal::scalar_mod_op(rhs)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - cwiseMax(Scalar threshold) const { - return cwiseMax(constant(threshold)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - cwiseMin(Scalar threshold) const { - return cwiseMin(constant(threshold)); - } - - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorConversionOp - cast() const { - return TensorConversionOp(derived()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - round() const { - return unaryExpr(internal::scalar_round_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - ceil() const { - return unaryExpr(internal::scalar_ceil_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - floor() const { - return unaryExpr(internal::scalar_floor_op()); - } - - // Generic binary operation support. - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp - binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { - return TensorCwiseBinaryOp(derived(), other, func); - } - - // Coefficient-wise binary operators. 
- template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator+(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_sum_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator-(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_difference_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator*(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_product_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator/(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_quotient_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - cwiseMax(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_max_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - cwiseMin(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_min_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp - operator&&(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp - operator||(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp - operator^(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_boolean_xor_op()); - } - - // Comparisons and tests. 
- template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator==(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator!=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op()); - } - - // comparisons and tests for Scalars - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - operator<(Scalar threshold) const { - return operator<(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - operator<=(Scalar threshold) const { - return operator<=(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - operator>(Scalar threshold) const { - return operator>(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - operator>=(Scalar threshold) const { - return operator>=(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - operator==(Scalar threshold) const { - return operator==(constant(threshold)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > - operator!=(Scalar threshold) const { - return operator!=(constant(threshold)); - } - - // Checks - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - (isnan)() const { - return unaryExpr(internal::scalar_isnan_op()); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - (isinf)() const { - return unaryExpr(internal::scalar_isinf_op()); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - (isfinite)() const { - return unaryExpr(internal::scalar_isfinite_op()); - } - - // Coefficient-wise ternary operators. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorSelectOp - select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { - return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); - } - - // Contractions. 
- typedef Eigen::IndexPair DimensionPair; - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorContractionOp - contract(const OtherDerived& other, const Dimensions& dims) const { - return TensorContractionOp(derived(), other.derived(), dims); - } - - // Convolutions. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorConvolutionOp - convolve(const KernelDerived& kernel, const Dimensions& dims) const { - return TensorConvolutionOp(derived(), kernel.derived(), dims); - } - - // Fourier transforms - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorFFTOp - fft(const FFT& fft) const { - return TensorFFTOp(derived(), fft); - } - - // Scan. - typedef TensorScanOp, const Derived> TensorScanSumOp; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorScanSumOp - cumsum(const Index& axis, bool exclusive = false) const { - return TensorScanSumOp(derived(), axis, exclusive); - } - - typedef TensorScanOp, const Derived> TensorScanProdOp; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorScanProdOp - cumprod(const Index& axis, bool exclusive = false) const { - return TensorScanProdOp(derived(), axis, exclusive); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorScanOp - scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const { - return TensorScanOp(derived(), axis, exclusive, reducer); - } - - // Reductions. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> - sum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); - } - - const TensorReductionOp, const DimensionList, const Derived> - sum() const { - DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::SumReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> - mean(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MeanReducer()); - } - - const TensorReductionOp, const DimensionList, const Derived> - mean() const { - DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MeanReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> - prod(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::ProdReducer()); - } - - const TensorReductionOp, const DimensionList, const Derived> - prod() const { - DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::ProdReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> - maximum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); - } - - const TensorReductionOp, const DimensionList, const Derived> - maximum() const { - DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const Dims, const Derived> - minimum(const Dims& dims) const { - return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); - } - - const TensorReductionOp, const DimensionList, const Derived> - 
minimum() const { - DimensionList in_dims; - return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp > - all(const Dims& dims) const { - return cast().reduce(dims, internal::AndReducer()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const TensorConversionOp > - all() const { - DimensionList in_dims; - return cast().reduce(in_dims, internal::AndReducer()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp > - any(const Dims& dims) const { - return cast().reduce(dims, internal::OrReducer()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const TensorConversionOp > - any() const { - DimensionList in_dims; - return cast().reduce(in_dims, internal::OrReducer()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMaxTupleReducer >, - const array, const Derived> - argmax() const { - array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; - return TensorTupleReducerOp< - internal::ArgMaxTupleReducer >, - const array, - const Derived>(derived(), internal::ArgMaxTupleReducer >(), -1, in_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMinTupleReducer >, - const array, const Derived> - argmin() const { - array in_dims; - for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; - return TensorTupleReducerOp< - internal::ArgMinTupleReducer >, - const array, - const Derived>(derived(), internal::ArgMinTupleReducer >(), -1, in_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMaxTupleReducer >, - const array, const Derived> - argmax(const int return_dim) const { - array in_dims; - in_dims[0] = return_dim; - return TensorTupleReducerOp< - internal::ArgMaxTupleReducer >, - const array, - const Derived>(derived(), internal::ArgMaxTupleReducer >(), return_dim, in_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTupleReducerOp< - internal::ArgMinTupleReducer >, - const array, const Derived> - argmin(const int return_dim) const { - array in_dims; - in_dims[0] = return_dim; - return TensorTupleReducerOp< - internal::ArgMinTupleReducer >, - const array, - const Derived>(derived(), internal::ArgMinTupleReducer >(), return_dim, in_dims); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp - reduce(const Dims& dims, const Reducer& reducer) const { - return TensorReductionOp(derived(), dims, reducer); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorBroadcastingOp - broadcast(const Broadcast& broadcast) const { - return TensorBroadcastingOp(derived(), broadcast); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorConcatenationOp - concatenate(const OtherDerived& other, Axis axis) const { - return TensorConcatenationOp(derived(), other.derived(), axis); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorPatchOp - extract_patches(const PatchDims& patch_dims) const { - return TensorPatchOp(derived(), patch_dims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorImagePatchOp - extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1, - const Index row_stride = 1, const Index col_stride = 1, - const Index in_row_stride = 1, const Index in_col_stride = 1, - const PaddingType padding_type = PADDING_SAME, const Scalar 
padding_value = Scalar(0)) const { - return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, - in_row_stride, in_col_stride, 1, 1, padding_type, padding_value); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorImagePatchOp - extract_image_patches(const Index patch_rows, const Index patch_cols, - const Index row_stride, const Index col_stride, - const Index in_row_stride, const Index in_col_stride, - const Index row_inflate_stride, const Index col_inflate_stride, - const Index padding_top, const Index padding_bottom, - const Index padding_left,const Index padding_right, - const Scalar padding_value) const { - return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, - in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, - padding_top, padding_bottom, padding_left, padding_right, padding_value); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorVolumePatchOp - extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, - const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, - const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { - return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); - } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorVolumePatchOp - extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, - const Index plane_stride, const Index row_stride, const Index col_stride, - const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, - const Index padding_top_z, const Index padding_bottom_z, - const Index padding_top, const Index padding_bottom, - const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const { - return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); - } - - // Morphing operators. 
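The contract(), scan, reduction, broadcast and patch-extraction methods removed in the preceding hunks make up the bulk of the read-only TensorBase API. An illustrative sketch of how they are typically called (shapes chosen arbitrarily; not part of the patch):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> A(2, 3), B(3, 4);
      A.setRandom();
      B.setRandom();

      // contract(): sum over dimension 1 of A and dimension 0 of B (a matrix product).
      Eigen::array<Eigen::IndexPair<int>, 1> cdims = {{ Eigen::IndexPair<int>(1, 0) }};
      Eigen::Tensor<float, 2> C = A.contract(B, cdims);     // 2 x 4

      // Reductions: full sum() yields a rank-0 tensor, maximum() over one axis drops it.
      Eigen::Tensor<float, 0> total = C.sum();
      Eigen::array<int, 1> axis0 = {{0}};
      Eigen::Tensor<float, 1> col_max = C.maximum(axis0);   // size 4

      // Scans keep the shape, broadcast() replicates it along each dimension.
      Eigen::Tensor<float, 2> running = C.cumsum(1);        // prefix sums along axis 1
      Eigen::array<int, 2> bcast = {{2, 1}};
      Eigen::Tensor<float, 2> tiled = C.broadcast(bcast);   // 4 x 4
      return 0;
    }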
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorLayoutSwapOp - swap_layout() const { - return TensorLayoutSwapOp(derived()); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReshapingOp - reshape(const NewDimensions& newDimensions) const { - return TensorReshapingOp(derived(), newDimensions); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorSlicingOp - slice(const StartIndices& startIndices, const Sizes& sizes) const { - return TensorSlicingOp(derived(), startIndices, sizes); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingSlicingOp - stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { - return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp - chip(const Index offset) const { - return TensorChippingOp(derived(), offset, DimId); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp - chip(const Index offset, const Index dim) const { - return TensorChippingOp(derived(), offset, dim); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReverseOp - reverse(const ReverseDimensions& rev) const { - return TensorReverseOp(derived(), rev); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorPaddingOp - pad(const PaddingDimensions& padding) const { - return TensorPaddingOp(derived(), padding, internal::scalar_cast_op()(0)); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorPaddingOp - pad(const PaddingDimensions& padding, const Scalar padding_value) const { - return TensorPaddingOp(derived(), padding, padding_value); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorShufflingOp - shuffle(const Shuffle& shuffle) const { - return TensorShufflingOp(derived(), shuffle); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingOp - stride(const Strides& strides) const { - return TensorStridingOp(derived(), strides); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorInflationOp - inflate(const Strides& strides) const { - return TensorInflationOp(derived(), strides); - } - - // Returns a tensor containing index/value tuples - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorIndexTupleOp - index_tuples() const { - return TensorIndexTupleOp(derived()); - } - - // Support for custom unary and binary operations - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCustomUnaryOp customOp(const CustomUnaryFunc& op) const { - return TensorCustomUnaryOp(derived(), op); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCustomBinaryOp customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { - return TensorCustomBinaryOp(derived(), other, op); - } - - // Force the evaluation of the expression. 
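The morphing operators deleted above (swap_layout, reshape, slice, chip, reverse, pad, shuffle, stride, inflate) all return lazily evaluated views of the underlying expression. A short sketch of the most common ones, illustrative only, with arbitrary dimensions:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> m(4, 6);
      m.setRandom();

      // reshape() to a different rank; slice() out a 2x3 window starting at (1, 2).
      Eigen::array<Eigen::DenseIndex, 3> new_dims = {{2, 2, 6}};
      Eigen::Tensor<float, 3> reshaped = m.reshape(new_dims);

      Eigen::array<Eigen::DenseIndex, 2> offsets = {{1, 2}};
      Eigen::array<Eigen::DenseIndex, 2> extents = {{2, 3}};
      Eigen::Tensor<float, 2> window = m.slice(offsets, extents);

      // chip() removes one dimension (here: row 1); shuffle() permutes dimensions.
      Eigen::Tensor<float, 1> row1 = m.chip(1, 0);          // size 6
      Eigen::array<int, 2> transpose = {{1, 0}};
      Eigen::Tensor<float, 2> mt = m.shuffle(transpose);    // 6 x 4
      return 0;
    }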
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorForcedEvalOp eval() const { - return TensorForcedEvalOp(derived()); - } - - protected: - template friend class Tensor; - template friend class TensorFixedSize; - template friend class TensorBase; - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } -}; - -template::value> -class TensorBase : public TensorBase { - public: - typedef internal::traits DerivedTraits; - typedef typename DerivedTraits::Scalar Scalar; - typedef typename DerivedTraits::Index Index; - typedef Scalar CoeffReturnType; - static const int NumDimensions = DerivedTraits::NumDimensions; - - template friend class Tensor; - template friend class TensorFixedSize; - template friend class TensorBase; - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setZero() { - return setConstant(Scalar(0)); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { - return derived() = this->constant(val); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setRandom() { - return derived() = this->random(); - } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setRandom() { - return derived() = this->template random(); - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& setValues( - const typename internal::Initializer::InitList& vals) { - TensorEvaluator eval(derived(), DefaultDevice()); - internal::initialize_tensor(eval, vals); - return derived(); - } -#endif // EIGEN_HAS_VARIADIC_TEMPLATES - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator+=(const OtherDerived& other) { - return derived() = derived() + other.derived(); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator-=(const OtherDerived& other) { - return derived() = derived() - other.derived(); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator*=(const OtherDerived& other) { - return derived() = derived() * other.derived(); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Derived& operator/=(const OtherDerived& other) { - return derived() = derived() / other.derived(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorLayoutSwapOp - swap_layout() const { - return TensorLayoutSwapOp(derived()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorLayoutSwapOp - swap_layout() { - return TensorLayoutSwapOp(derived()); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorConcatenationOp - concatenate(const OtherDerived& other, const Axis& axis) const { - return TensorConcatenationOp(derived(), other, axis); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorConcatenationOp - concatenate(const OtherDerived& other, const Axis& axis) { - return TensorConcatenationOp(derived(), other, axis); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReshapingOp - reshape(const NewDimensions& newDimensions) const { - return TensorReshapingOp(derived(), newDimensions); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReshapingOp - reshape(const NewDimensions& newDimensions) { - return TensorReshapingOp(derived(), newDimensions); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorSlicingOp - slice(const StartIndices& startIndices, const Sizes& sizes) const { - return TensorSlicingOp(derived(), startIndices, sizes); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorSlicingOp - slice(const StartIndices& startIndices, const Sizes& sizes) { - return 
TensorSlicingOp(derived(), startIndices, sizes); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingSlicingOp - stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { - return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorStridingSlicingOp - stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) { - return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp - chip(const Index offset) const { - return TensorChippingOp(derived(), offset, DimId); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorChippingOp - chip(const Index offset) { - return TensorChippingOp(derived(), offset, DimId); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorChippingOp - chip(const Index offset, const Index dim) const { - return TensorChippingOp(derived(), offset, dim); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorChippingOp - chip(const Index offset, const Index dim) { - return TensorChippingOp(derived(), offset, dim); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReverseOp - reverse(const ReverseDimensions& rev) const { - return TensorReverseOp(derived(), rev); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReverseOp - reverse(const ReverseDimensions& rev) { - return TensorReverseOp(derived(), rev); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorShufflingOp - shuffle(const Shuffle& shuffle) const { - return TensorShufflingOp(derived(), shuffle); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorShufflingOp - shuffle(const Shuffle& shuffle) { - return TensorShufflingOp(derived(), shuffle); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorStridingOp - stride(const Strides& strides) const { - return TensorStridingOp(derived(), strides); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorStridingOp - stride(const Strides& strides) { - return TensorStridingOp(derived(), strides); - } - - // Select the device on which to evaluate the expression. - template - TensorDevice device(const DeviceType& device) { - return TensorDevice(device, derived()); - } - - protected: - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } -}; -#endif // EIGEN_PARSED_BY_DOXYGEN -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h deleted file mode 100644 index 4cfe300..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ /dev/null @@ -1,392 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
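Before the next deleted file, a usage sketch of the writable TensorBase specialization that closes the file above: the set* helpers, compound assignment, device() placement and eval(). Illustrative only; DefaultDevice is used here, though the same assignment form works with ThreadPoolDevice or GpuDevice.

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> t(2, 2);

      // In-place setters from the writable TensorBase specialization.
      t.setValues({{1.0f, 2.0f}, {3.0f, 4.0f}});
      t += t.constant(1.0f);                  // compound assignment with an expression

      // device() selects where the assignment is evaluated; DefaultDevice is the
      // single-threaded CPU path.
      Eigen::Tensor<float, 2> out(2, 2);
      Eigen::DefaultDevice cpu;
      out.device(cpu) = t * 2.0f;

      // eval() forces evaluation of a subexpression into a temporary.
      Eigen::Tensor<float, 2> squared = (t * t).eval();
      return 0;
    }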
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H -#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H - -namespace Eigen { - -/** \class TensorBroadcasting - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor broadcasting class. - * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorBroadcastingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorBroadcastingOp type; -}; - -template -struct is_input_scalar { - static const bool value = false; -}; -template <> -struct is_input_scalar > { - static const bool value = true; -}; -#ifndef EIGEN_EMULATE_CXX11_META_H -template -struct is_input_scalar > { - static const bool value = (Sizes::total_size == 1); -}; -#endif - -} // end namespace internal - - - -template -class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) - : m_xpr(expr), m_broadcast(broadcast) {} - - EIGEN_DEVICE_FUNC - const Broadcast& broadcast() const { return m_broadcast; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const Broadcast m_broadcast; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorBroadcastingOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename TensorEvaluator::Dimensions InputDimensions; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_broadcast(op.broadcast()),m_impl(op.expression(), device) - { - // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar - // and store the result in a scalar. Instead one should reshape the scalar into a a N-D - // tensor with N >= 1 of 1 element first and then broadcast. 
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - const InputDimensions& input_dims = m_impl.dimensions(); - const Broadcast& broadcast = op.broadcast(); - for (int i = 0; i < NumDims; ++i) { - eigen_assert(input_dims[i] > 0); - m_dimensions[i] = input_dims[i] * broadcast[i]; - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } - } else { - m_inputStrides[NumDims-1] = 1; - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims-2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const - { - if (internal::is_input_scalar::type>::value) { - return m_impl.coeff(0); - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - return coeffColMajor(index); - } else { - return coeffRowMajor(index); - } - } - - // TODO: attempt to speed this up. The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const - { - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index < m_impl.dimensions()[0]); - inputIndex += index; - } else { - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index % m_impl.dimensions()[0] == 0); - } else { - inputIndex += (index % m_impl.dimensions()[0]); - } - } - return m_impl.coeff(inputIndex); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const - { - Index inputIndex = 0; - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); - inputIndex += index; - } else { - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); - } else { - inputIndex += (index % m_impl.dimensions()[NumDims-1]); - } - } - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const - { - if (internal::is_input_scalar::type>::value) { 
- return internal::pset1(m_impl.coeff(0)); - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - return packetColMajor(index); - } else { - return packetRowMajor(index); - } - } - - // Ignore the LoadMode and always use unaligned loads since we can't guarantee - // the alignment at compile time. - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index originalIndex = index; - - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - Index innermostLoc; - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index < m_impl.dimensions()[0]); - innermostLoc = index; - } else { - if (internal::index_statically_eq(0, 1)) { - eigen_assert(index % m_impl.dimensions()[0] == 0); - innermostLoc = 0; - } else { - innermostLoc = index % m_impl.dimensions()[0]; - } - } - inputIndex += innermostLoc; - - // Todo: this could be extended to the second dimension if we're not - // broadcasting alongside the first dimension, and so on. - if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { - return m_impl.template packet(inputIndex); - } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffColMajor(originalIndex+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index originalIndex = index; - - Index inputIndex = 0; - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx < m_impl.dimensions()[i]); - inputIndex += idx * m_inputStrides[i]; - } else { - if (internal::index_statically_eq(i, 1)) { - eigen_assert(idx % m_impl.dimensions()[i] == 0); - } else { - inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; - } - } - index -= idx * m_outputStrides[i]; - } - Index innermostLoc; - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims-1]); - innermostLoc = index; - } else { - if (internal::index_statically_eq(NumDims-1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); - innermostLoc = 0; - } else { - innermostLoc = index % m_impl.dimensions()[NumDims-1]; - } - } - inputIndex += innermostLoc; - - // Todo: this could be extended to the second dimension if we're not - // broadcasting alongside the first dimension, and so on. 
- if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { - return m_impl.template packet(inputIndex); - } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - values[0] = m_impl.coeff(inputIndex); - for (int i = 1; i < PacketSize; ++i) { - values[i] = coeffRowMajor(originalIndex+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - double compute_cost = TensorOpCost::AddCost(); - if (NumDims > 0) { - for (int i = NumDims - 1; i > 0; --i) { - compute_cost += TensorOpCost::DivCost(); - if (internal::index_statically_eq(i, 1)) { - compute_cost += - TensorOpCost::MulCost() + TensorOpCost::AddCost(); - } else { - if (!internal::index_statically_eq(i, 1)) { - compute_cost += TensorOpCost::MulCost() + - TensorOpCost::ModCost() + - TensorOpCost::AddCost(); - } - } - compute_cost += - TensorOpCost::MulCost() + TensorOpCost::AddCost(); - } - } - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - const TensorEvaluator& impl() const { return m_impl; } - - Broadcast functor() const { return m_broadcast; } - - protected: - const Broadcast m_broadcast; - Dimensions m_dimensions; - array m_outputStrides; - array m_inputStrides; - TensorEvaluator m_impl; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h deleted file mode 100644 index 1ba7ef1..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ /dev/null @@ -1,384 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H -#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H - -namespace Eigen { - -/** \class TensorKChippingReshaping - * \ingroup CXX11_Tensor_Module - * - * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. 
- * - * - */ - -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions - 1; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorChippingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorChippingOp type; -}; - -template -struct DimensionId -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { - eigen_assert(dim == DimId); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { - return DimId; - } -}; -template <> -struct DimensionId -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { - eigen_assert(dim >= 0); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { - return actual_dim; - } - private: - const DenseIndex actual_dim; -}; - - -} // end namespace internal - - - -template -class TensorChippingOp : public TensorBase > -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) - : m_xpr(expr), m_offset(offset), m_dim(dim) { - } - - EIGEN_DEVICE_FUNC - const Index offset() const { return m_offset; } - EIGEN_DEVICE_FUNC - const Index dim() const { return m_dim.actualDim(); } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorChippingOp& operator = (const TensorChippingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorChippingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - protected: - typename XprType::Nested m_xpr; - const Index m_offset; - const internal::DimensionId m_dim; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorChippingOp XprType; - static const int NumInputDims = internal::array_size::Dimensions>::value; - static const int NumDims = NumInputDims-1; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - - enum { - // Alignment can't be guaranteed at compile time since it depends on the - // slice offsets. 
- IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) - { - EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(NumInputDims > m_dim.actualDim()); - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); - - int j = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (i != m_dim.actualDim()) { - m_dimensions[j] = input_dims[i]; - ++j; - } - } - - m_stride = 1; - m_inputStride = 1; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < m_dim.actualDim(); ++i) { - m_stride *= input_dims[i]; - m_inputStride *= input_dims[i]; - } - } else { - for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { - m_stride *= input_dims[i]; - m_inputStride *= input_dims[i]; - } - } - m_inputStride *= input_dims[m_dim.actualDim()]; - m_inputOffset = m_stride * op.offset(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(srcCoeff(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { - // m_stride is equal to 1, so let's avoid the integer division. - eigen_assert(m_stride == 1); - Index inputIndex = index * m_inputStride + m_inputOffset; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = m_impl.coeff(inputIndex); - inputIndex += m_inputStride; - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. - eigen_assert(m_stride > index); - return m_impl.template packet(index + m_inputOffset); - } else { - const Index idx = index / m_stride; - const Index rem = index - idx * m_stride; - if (rem + PacketSize <= m_stride) { - Index inputIndex = idx * m_inputStride + m_inputOffset + rem; - return m_impl.template packet(inputIndex); - } else { - // Cross the stride boundary. Fallback to slow path. 
- EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index); - ++index; - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - double cost = 0; - if ((static_cast(Layout) == static_cast(ColMajor) && - m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && - m_dim.actualDim() == NumInputDims - 1)) { - cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); - } else if ((static_cast(Layout) == static_cast(ColMajor) && - m_dim.actualDim() == NumInputDims - 1) || - (static_cast(Layout) == static_cast(RowMajor) && - m_dim.actualDim() == 0)) { - cost += TensorOpCost::AddCost(); - } else { - cost += 3 * TensorOpCost::MulCost() + TensorOpCost::DivCost() + - 3 * TensorOpCost::AddCost(); - } - - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { - CoeffReturnType* result = const_cast(m_impl.data()); - if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) && - result) { - return result + m_inputOffset; - } else { - return NULL; - } - } - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const - { - Index inputIndex; - if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { - // m_stride is equal to 1, so let's avoid the integer division. - eigen_assert(m_stride == 1); - inputIndex = index * m_inputStride + m_inputOffset; - } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. 
- eigen_assert(m_stride > index); - inputIndex = index + m_inputOffset; - } else { - const Index idx = index / m_stride; - inputIndex = idx * m_inputStride + m_inputOffset; - index -= idx * m_stride; - inputIndex += index; - } - return inputIndex; - } - - Dimensions m_dimensions; - Index m_stride; - Index m_inputOffset; - Index m_inputStride; - TensorEvaluator m_impl; - const internal::DimensionId m_dim; - const Device& m_device; -}; - - -// Eval as lvalue -template -struct TensorEvaluator, Device> - : public TensorEvaluator, Device> -{ - typedef TensorEvaluator, Device> Base; - typedef TensorChippingOp XprType; - static const int NumInputDims = internal::array_size::Dimensions>::value; - static const int NumDims = NumInputDims-1; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - - if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { - // m_stride is equal to 1, so let's avoid the integer division. - eigen_assert(this->m_stride == 1); - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - internal::pstore(values, x); - Index inputIndex = index * this->m_inputStride + this->m_inputOffset; - for (int i = 0; i < PacketSize; ++i) { - this->m_impl.coeffRef(inputIndex) = values[i]; - inputIndex += this->m_inputStride; - } - } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || - (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { - // m_stride is aways greater than index, so let's avoid the integer division. - eigen_assert(this->m_stride > index); - this->m_impl.template writePacket(index + this->m_inputOffset, x); - } else { - const Index idx = index / this->m_stride; - const Index rem = index - idx * this->m_stride; - if (rem + PacketSize <= this->m_stride) { - const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; - this->m_impl.template writePacket(inputIndex, x); - } else { - // Cross stride boundary. Fallback to slow path. 
- EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - internal::pstore(values, x); - for (int i = 0; i < PacketSize; ++i) { - this->coeffRef(index) = values[i]; - ++index; - } - } - } - } -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h deleted file mode 100644 index 59bf90d..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ /dev/null @@ -1,361 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H - -namespace Eigen { - -/** \class TensorConcatenationOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor concatenation class. - * - * - */ -namespace internal { -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename promote_storage_type::ret Scalar; - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; - enum { Flags = 0 }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorConcatenationOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorConcatenationOp type; -}; - -} // end namespace internal - - -template -class TensorConcatenationOp : public TensorBase, WriteAccessors> -{ - public: - typedef typename internal::traits::Scalar Scalar; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - typedef typename internal::nested::type Nested; - typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename NumTraits::Real RealScalar; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } - - EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const TensorConcatenationOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorConcatenationOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - protected: - 
typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const Axis m_axis; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorConcatenationOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - static const int RightNumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - - eigen_assert(0 <= m_axis && m_axis < NumDims); - const Dimensions& lhs_dims = m_leftImpl.dimensions(); - const Dimensions& rhs_dims = m_rightImpl.dimensions(); - { - int i = 0; - for (; i < m_axis; ++i) { - eigen_assert(lhs_dims[i] > 0); - eigen_assert(lhs_dims[i] == rhs_dims[i]); - m_dimensions[i] = lhs_dims[i]; - } - eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. - eigen_assert(rhs_dims[i] > 0); - m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; - for (++i; i < NumDims; ++i) { - eigen_assert(lhs_dims[i] > 0); - eigen_assert(lhs_dims[i] == rhs_dims[i]); - m_dimensions[i] = lhs_dims[i]; - } - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_leftStrides[0] = 1; - m_rightStrides[0] = 1; - m_outputStrides[0] = 1; - - for (int j = 1; j < NumDims; ++j) { - m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1]; - m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1]; - m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1]; - } - } else { - m_leftStrides[NumDims - 1] = 1; - m_rightStrides[NumDims - 1] = 1; - m_outputStrides[NumDims - 1] = 1; - - for (int j = NumDims - 2; j >= 0; --j) { - m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1]; - m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1]; - m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) - { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() - { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - } - - // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. - // See CL/76180724 comments for more ideas. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - // Collect dimension-wise indices (subs). 
- array subs; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - subs[i] = index / m_outputStrides[i]; - index -= subs[i] * m_outputStrides[i]; - } - subs[0] = index; - } else { - for (int i = 0; i < NumDims - 1; ++i) { - subs[i] = index / m_outputStrides[i]; - index -= subs[i] * m_outputStrides[i]; - } - subs[NumDims - 1] = index; - } - - const Dimensions& left_dims = m_leftImpl.dimensions(); - if (subs[m_axis] < left_dims[m_axis]) { - Index left_index; - if (static_cast(Layout) == static_cast(ColMajor)) { - left_index = subs[0]; - for (int i = 1; i < NumDims; ++i) { - left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; - } - } else { - left_index = subs[NumDims - 1]; - for (int i = NumDims - 2; i >= 0; --i) { - left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; - } - } - return m_leftImpl.coeff(left_index); - } else { - subs[m_axis] -= left_dims[m_axis]; - const Dimensions& right_dims = m_rightImpl.dimensions(); - Index right_index; - if (static_cast(Layout) == static_cast(ColMajor)) { - right_index = subs[0]; - for (int i = 1; i < NumDims; ++i) { - right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; - } - } else { - right_index = subs[NumDims - 1]; - for (int i = NumDims - 2; i >= 0; --i) { - right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; - } - } - return m_rightImpl.coeff(right_index); - } - } - - // TODO(phli): Add a real vectorization. - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; - for (int i = 0; i < packetSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + - 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost() + - TensorOpCost::ModCost()); - const double lhs_size = m_leftImpl.dimensions().TotalSize(); - const double rhs_size = m_rightImpl.dimensions().TotalSize(); - return (lhs_size / (lhs_size + rhs_size)) * - m_leftImpl.costPerCoeff(vectorized) + - (rhs_size / (lhs_size + rhs_size)) * - m_rightImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - Dimensions m_dimensions; - array m_outputStrides; - array m_leftStrides; - array m_rightStrides; - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; - const Axis m_axis; -}; - -// Eval as lvalue -template - struct TensorEvaluator, Device> - : public TensorEvaluator, Device> -{ - typedef TensorEvaluator, Device> Base; - typedef TensorConcatenationOp XprType; - typedef typename Base::Dimensions Dimensions; - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) - : Base(op, device) - { - EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType 
CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - // Collect dimension-wise indices (subs). - array subs; - for (int i = Base::NumDims - 1; i > 0; --i) { - subs[i] = index / this->m_outputStrides[i]; - index -= subs[i] * this->m_outputStrides[i]; - } - subs[0] = index; - - const Dimensions& left_dims = this->m_leftImpl.dimensions(); - if (subs[this->m_axis] < left_dims[this->m_axis]) { - Index left_index = subs[0]; - for (int i = 1; i < Base::NumDims; ++i) { - left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i]; - } - return this->m_leftImpl.coeffRef(left_index); - } else { - subs[this->m_axis] -= left_dims[this->m_axis]; - const Dimensions& right_dims = this->m_rightImpl.dimensions(); - Index right_index = subs[0]; - for (int i = 1; i < Base::NumDims; ++i) { - right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i]; - } - return this->m_rightImpl.coeffRef(right_index); - } - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); - - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; - internal::pstore(values, x); - for (int i = 0; i < packetSize; ++i) { - coeffRef(index+i) = values[i]; - } - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h deleted file mode 100644 index 20b29e5..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ /dev/null @@ -1,628 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H - -namespace Eigen { - -/** \class TensorContraction - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor contraction class. - * - * - */ -namespace internal { - -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename gebp_traits::type, - typename remove_const::type>::ResScalar Scalar; - - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - - // From NumDims below. 
- static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; - static const int Layout = traits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorContractionOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorContractionOp type; -}; - -template -struct traits, Device_> > { - typedef Indices_ Indices; - typedef LeftArgType_ LeftArgType; - typedef RightArgType_ RightArgType; - typedef Device_ Device; - - // From NumDims below. - static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; -}; - -} // end namespace internal - -template -class TensorContractionOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename internal::gebp_traits::ResScalar CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( - const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} - - EIGEN_DEVICE_FUNC - const Indices& indices() const { return m_indices; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const Indices m_indices; -}; - - -template -struct TensorContractionEvaluatorBase -{ - typedef typename internal::traits::Indices Indices; - typedef typename internal::traits::LeftArgType LeftArgType; - typedef typename internal::traits::RightArgType RightArgType; - typedef typename internal::traits::Device Device; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - IsAligned = true, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. 
- typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - typedef DSizes Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.rhsExpression(), op.lhsExpression()), device), - m_device(device), - m_result(NULL) { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == - static_cast(TensorEvaluator::Layout)), - YOU_MADE_A_PROGRAMMING_MISTAKE); - - - DSizes eval_left_dims; - DSizes eval_right_dims; - array, ContractDims> eval_op_indices; - if (static_cast(Layout) == static_cast(ColMajor)) { - // For ColMajor, we keep using the existing dimensions - for (int i = 0; i < LDims; i++) { - eval_left_dims[i] = m_leftImpl.dimensions()[i]; - } - for (int i = 0; i < RDims; i++) { - eval_right_dims[i] = m_rightImpl.dimensions()[i]; - } - // We keep the pairs of contracting indices. - for (int i = 0; i < ContractDims; i++) { - eval_op_indices[i].first = op.indices()[i].first; - eval_op_indices[i].second = op.indices()[i].second; - } - } else { - // For RowMajor, we need to reverse the existing dimensions - for (int i = 0; i < LDims; i++) { - eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; - } - for (int i = 0; i < RDims; i++) { - eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; - } - // We need to flip all the pairs of contracting indices as well as - // reversing the dimensions. - for (int i = 0; i < ContractDims; i++) { - eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; - eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; - } - } - - // Check for duplicate axes and make sure the first index in eval_op_indices - // is increasing. 
Using O(n^2) sorting is OK since ContractDims is small - for (int i = 0; i < ContractDims; i++) { - for (int j = i + 1; j < ContractDims; j++) { - eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first && - eval_op_indices[j].second != eval_op_indices[i].second && - "contraction axes should be unique"); - if (eval_op_indices[j].first < eval_op_indices[i].first) { - numext::swap(eval_op_indices[j], eval_op_indices[i]); - } - } - } - - array lhs_strides; - lhs_strides[0] = 1; - for (int i = 0; i < LDims-1; ++i) { - lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; - } - - array rhs_strides; - rhs_strides[0] = 1; - for (int i = 0; i < RDims-1; ++i) { - rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; - } - - if (m_i_strides.size() > 0) m_i_strides[0] = 1; - if (m_j_strides.size() > 0) m_j_strides[0] = 1; - if (m_k_strides.size() > 0) m_k_strides[0] = 1; - - m_i_size = 1; - m_j_size = 1; - m_k_size = 1; - - // To compute the dimension, we simply concatenate the non-contracting - // dimensions of the left and then the right tensor. Additionally, we also - // compute the strides corresponding to the left non-contracting - // dimensions and right non-contracting dimensions. - m_lhs_inner_dim_contiguous = true; - int dim_idx = 0; - unsigned int nocontract_idx = 0; - - for (int i = 0; i < LDims; i++) { - // find if we are contracting on index i of left tensor - bool contracting = false; - for (int j = 0; j < ContractDims; j++) { - if (eval_op_indices[j].first == i) { - contracting = true; - break; - } - } - if (!contracting) { - // add dimension size to output dimensions - m_dimensions[dim_idx] = eval_left_dims[i]; - m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; - if (dim_idx != i) { - m_lhs_inner_dim_contiguous = false; - } - if (nocontract_idx+1 < internal::array_size::value) { - m_i_strides[nocontract_idx+1] = - m_i_strides[nocontract_idx] * eval_left_dims[i]; - } else { - m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; - } - dim_idx++; - nocontract_idx++; - } - } - - nocontract_idx = 0; - for (int i = 0; i < RDims; i++) { - bool contracting = false; - // find if we are contracting on index i of right tensor - for (int j = 0; j < ContractDims; j++) { - if (eval_op_indices[j].second == i) { - contracting = true; - break; - } - } - if (!contracting) { - m_dimensions[dim_idx] = eval_right_dims[i]; - if (nocontract_idx+1 < internal::array_size::value) { - m_j_strides[nocontract_idx+1] = - m_j_strides[nocontract_idx] * eval_right_dims[i]; - } else { - m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; - } - m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; - dim_idx++; - nocontract_idx++; - } - } - - // Now compute the strides corresponding to the contracting dimensions. We - // assumed above that non-contracting axes are represented in the same order - // in the matrix as they are in the tensor. This is not the case for - // contracting axes. As the contracting axes must be of the same size in - // each tensor, we'll only look at the first tensor here. 
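[Editor's note] The lhs_strides / rhs_strides arrays built above use the standard column-major stride recurrence: the innermost dimension has stride 1 and each further dimension strides over everything before it. A small hedged sketch of that recurrence, outside of Eigen's types:

    #include <array>
    #include <cstddef>

    // Column-major strides for a rank-N shape (N >= 1), matching the
    // lhs_strides / rhs_strides loops above.
    template <std::size_t N>
    std::array<std::size_t, N> colMajorStrides(const std::array<std::size_t, N>& dims) {
      static_assert(N >= 1, "rank must be at least 1");
      std::array<std::size_t, N> strides{};
      strides[0] = 1;
      for (std::size_t i = 0; i + 1 < N; ++i) strides[i + 1] = strides[i] * dims[i];
      return strides;
    }
    // e.g. dims {4, 5, 6} -> strides {1, 4, 20}; element (i, j, k) sits at i + 4*j + 20*k.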
- m_rhs_inner_dim_contiguous = true; - m_rhs_inner_dim_reordered = false; - for (int i = 0; i < ContractDims; i++) { - Index left = eval_op_indices[i].first; - Index right = eval_op_indices[i].second; - - Index size = eval_left_dims[left]; - eigen_assert(size == eval_right_dims[right] && - "Contraction axes must be same size"); - - if (i+1 < static_cast(internal::array_size::value)) { - m_k_strides[i+1] = m_k_strides[i] * size; - } else { - m_k_size = m_k_strides[i] * size; - } - m_left_contracting_strides[i] = lhs_strides[left]; - m_right_contracting_strides[i] = rhs_strides[right]; - - if (i > 0 && right < eval_op_indices[i-1].second) { - m_rhs_inner_dim_reordered = true; - } - if (right != i) { - m_rhs_inner_dim_contiguous = false; - } - } - - // If the layout is RowMajor, we need to reverse the m_dimensions - if (static_cast(Layout) == static_cast(RowMajor)) { - for (int i = 0, j = NumDims - 1; i < j; i++, j--) { - numext::swap(m_dimensions[i], m_dimensions[j]); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - evalTo(m_result); - return true; - } - } - - EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - static_cast(this)->template evalProduct(buffer); - } - else { - static_cast(this)->template evalProduct(buffer); - } - } - } - } - - template - EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { - const Index rows = m_i_size; - const Index cols = m_k_size; - - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - const Index lhs_packet_size = internal::unpacket_traits::size; - const Index rhs_packet_size = internal::unpacket_traits::size; - const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; - const int rhs_alignment = RightEvaluator::IsAligned ? 
Aligned : Unaligned; - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, - m_left_contracting_strides, m_k_strides); - RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, - m_right_contracting_strides, m_k_strides); - - const Scalar alpha(1); - const Index resIncr(1); - - // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) - m_device.memset(buffer, 0, rows * sizeof(Scalar)); - - internal::general_matrix_vector_product::run( - rows, cols, lhs, rhs, - buffer, resIncr, alpha); - } - - template - EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - // define mr, nr, and all of my data mapper types - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef typename internal::gebp_traits Traits; - - const Index nr = Traits::nr; - const Index mr = Traits::mr; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - const Index lhs_packet_size = internal::unpacket_traits::size; - const Index rhs_packet_size = internal::unpacket_traits::size; - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - // Declare GEBP packing and kernel structs - internal::gemm_pack_lhs pack_lhs; - internal::gemm_pack_rhs pack_rhs; - - internal::gebp_kernel gebp; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // Sizes of the blocks to load in cache. See the Goto paper for details. 
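[Editor's note] The "Goto paper" reference above is to cache blocking: k is split into panels of kc and m into blocks of mc, each panel of the operands is packed into a contiguous buffer, and a register-blocked micro-kernel (gebp) runs on the packed data. A deliberately naive, unpacked sketch of the same loop nest, assuming column-major storage and a zero-initialized C; the real evalGemm path uses gemm_pack_lhs, gemm_pack_rhs and gebp_kernel instead of the scalar inner loops:

    #include <algorithm>
    #include <cstddef>

    // Toy Goto-style blocked GEMM: A is m x k, B is k x n, C is m x n,
    // all column-major; C must be zeroed beforehand (as evalGemm does with memset).
    void blockedGemm(const float* A, const float* B, float* C,
                     std::size_t m, std::size_t n, std::size_t k,
                     std::size_t mc, std::size_t kc) {
      for (std::size_t k0 = 0; k0 < k; k0 += kc) {
        const std::size_t kb = std::min(kc, k - k0);
        for (std::size_t m0 = 0; m0 < m; m0 += mc) {
          const std::size_t mb = std::min(mc, m - m0);
          for (std::size_t j = 0; j < n; ++j)        // B panel reused for every block of A
            for (std::size_t p = 0; p < kb; ++p)
              for (std::size_t i = 0; i < mb; ++i)
                C[(m0 + i) + j * m] += A[(m0 + i) + (k0 + p) * m] * B[(k0 + p) + j * k];
        }
      }
    }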
- internal::TensorContractionBlocking blocking(k, m, n, 1); - const Index kc = blocking.kc(); - const Index mc = numext::mini(m, blocking.mc()); - const Index nc = numext::mini(n, blocking.nc()); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); - RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); - - for(Index i2=0; i2m_device.deallocate(blockA); - this->m_device.deallocate(blockB); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - - if (m_result != NULL) { - m_device.deallocate(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_result[index]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return internal::ploadt(m_result + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } - - protected: - // Prevent assignment - TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); - Dimensions m_dimensions; - - contract_t m_k_strides; - contract_t m_left_contracting_strides; - contract_t m_right_contracting_strides; - - bool m_lhs_inner_dim_contiguous; - bool m_rhs_inner_dim_contiguous; - bool m_rhs_inner_dim_reordered; - - left_nocontract_t m_i_strides; - right_nocontract_t m_j_strides; - left_nocontract_t m_left_nocontract_strides; - right_nocontract_t m_right_nocontract_strides; - - Index m_i_size; - Index m_j_size; - Index m_k_size; - - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; - const Device& m_device; - Scalar* m_result; -}; - - -// evaluator for default device -template -struct TensorEvaluator, Device> : - public TensorContractionEvaluatorBase< - TensorEvaluator, Device> > { - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - // Could we use NumDimensions here? 
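[Editor's note] A pattern worth calling out in evalTo() above: the three runtime flags (lhs inner dimension contiguous, rhs inner dimension contiguous, rhs inner dimension reordered) are converted into compile-time template arguments through an if/else ladder, so the derived evaluator's evalProduct is instantiated once per combination. A generic hedged sketch of that dispatch idiom, with illustrative names only:

    #include <iostream>

    // Stand-in for the specialized product kernel; in Eigen the booleans select
    // specialized input mappers.
    template <bool LhsContig, bool RhsContig>
    void runKernel() {
      std::cout << "lhs_contiguous=" << LhsContig
                << " rhs_contiguous=" << RhsContig << "\n";
    }

    // Runtime booleans folded into compile-time template parameters.
    void dispatch(bool lhsContig, bool rhsContig) {
      if (lhsContig) {
        if (rhsContig) runKernel<true,  true >();
        else           runKernel<true,  false>();
      } else {
        if (rhsContig) runKernel<false, true >();
        else           runKernel<false, false>();
      }
    }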
- typedef DSizes Dimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) { } - - template - EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - return; - } - - this->template evalGemm(buffer); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h deleted file mode 100644 index 5cf7b4f..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ /dev/null @@ -1,56 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H - - -namespace Eigen { -namespace internal { - -enum { - ShardByRow = 0, - ShardByCol = 1 -}; - - -// Default Blocking Strategy -template -class TensorContractionBlocking { - public: - - typedef typename LhsMapper::Scalar LhsScalar; - typedef typename RhsMapper::Scalar RhsScalar; - - EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : - kc_(k), mc_(m), nc_(n) - { - if (ShardingType == ShardByCol) { - computeProductBlockingSizes(kc_, mc_, nc_, num_threads); - } - else { - computeProductBlockingSizes(kc_, nc_, mc_, num_threads); - } - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } - - private: - Index kc_; - Index mc_; - Index nc_; -}; - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h deleted file mode 100644 index d65dbb4..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ /dev/null @@ -1,1391 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014-2015 Benoit Steiner -// Copyright (C) 2015 Navdeep Jaitly -// Copyright (C) 2014 Eric Martin -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
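[Editor's note] evalProduct() above falls back to a matrix-vector product whenever the contraction leaves no free dimensions on the right side (m_j_size == 1), since the blocked GEMM then degenerates into a GEMV. A minimal hedged sketch of that degenerate case, column-major and with a zero-initialized result, independent of Eigen's general_matrix_vector_product kernel:

    #include <cstddef>

    // y += A * x with A stored column-major (rows x cols); y must be zeroed first.
    void gemvColMajor(const float* A, const float* x, float* y,
                      std::size_t rows, std::size_t cols) {
      for (std::size_t j = 0; j < cols; ++j)
        for (std::size_t i = 0; i < rows; ++i)
          y[i] += A[i + j * rows] * x[j];
    }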
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H - -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - -namespace Eigen { - -template -__device__ EIGEN_STRONG_INLINE void -EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, - const Index m_size, const Index n_size, const Index k_size) { - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - // declare and initialize 64 registers for output 8x8 block - - // prefetch registers - Scalar lhs_pf0; - Scalar lhs_pf1; - Scalar lhs_pf2; - Scalar lhs_pf3; - Scalar lhs_pf4; - Scalar lhs_pf5; - Scalar lhs_pf6; - Scalar lhs_pf7; - - Scalar rhs_pf0; - Scalar rhs_pf1; - Scalar rhs_pf2; - Scalar rhs_pf3; - Scalar rhs_pf4; - Scalar rhs_pf5; - Scalar rhs_pf6; - Scalar rhs_pf7; - - // shared memory is formatted - // (contract idx in block, nocontract idx in block, block idx) - // where block idx is column major. This transposition limits the number of - // bank conflicts when reading the LHS. The core idea is that since the contracting - // index is shared by both sides, then the contracting index should be in threadIdx.x. - - // On the LHS, we pad each row inside of each block with an extra element. This makes - // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts - // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. - - // On the RHS we just add 8 padding elements to the end of each block. This gives no bank - // conflicts on writes and also none on reads. - - // storage indices - const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; - const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; - - const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; - const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; - const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; - const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; - const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; - const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; - const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; - const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; - - const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; - const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; - const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; - const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; - const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; - const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; - const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; - const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; - - // in the loading code, the following variables are important: - // threadIdx.x: the vertical position in an 8x8 block - // threadIdx.y: the vertical index of the 8x8 block in the grid - // threadIdx.z: the horizontal position in an 8x8 block - // k: the horizontal index of the 8x8 block in the grid - // - // The k parameter is implicit (it was the loop counter for a loop that went - // from 0 to <8, but now that loop is unrolled in the below code. 
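[Editor's note] The shared-memory layout described above pads each 8-element LHS row to 9 entries (72 per 8x8 block) so that threads of a warp land on distinct shared-memory banks. A hedged CUDA sketch of the same padding trick in its most familiar form, a tiled transpose; the tile shape, kernel and names are illustrative and not the kernel's actual indexing:

    // Tile stored with one extra column: without the "+ 1", the column-wise
    // reads below would all hit the same bank and serialize.
    constexpr int kTile = 8;

    __global__ void paddedTileDemo(const float* in, float* out, int n) {
      __shared__ float tile[kTile][kTile + 1];
      int x = blockIdx.x * kTile + threadIdx.x;
      int y = blockIdx.y * kTile + threadIdx.y;
      if (x < n && y < n) tile[threadIdx.y][threadIdx.x] = in[y * n + x];
      __syncthreads();
      // write the transposed tile; the padded stride keeps this conflict-free
      int tx = blockIdx.y * kTile + threadIdx.x;
      int ty = blockIdx.x * kTile + threadIdx.y;
      if (tx < n && ty < n) out[ty * n + tx] = tile[threadIdx.x][threadIdx.y];
    }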
- - const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; - const Index lhs_vert = base_m + load_idx_vert; - -#define prefetchIntoRegisters(base_k) \ - { \ - lhs_pf0 = conv(0); \ - lhs_pf1 = conv(0); \ - lhs_pf2 = conv(0); \ - lhs_pf3 = conv(0); \ - lhs_pf4 = conv(0); \ - lhs_pf5 = conv(0); \ - lhs_pf6 = conv(0); \ - lhs_pf7 = conv(0); \ - \ - rhs_pf0 = conv(0); \ - rhs_pf1 = conv(0); \ - rhs_pf2 = conv(0); \ - rhs_pf3 = conv(0); \ - rhs_pf4 = conv(0); \ - rhs_pf5 = conv(0); \ - rhs_pf6 = conv(0); \ - rhs_pf7 = conv(0); \ - \ - if (!needs_edge_check || lhs_vert < m_size) { \ - const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ - const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ - const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ - const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ - const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ - const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ - const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ - const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ - \ - if (!needs_edge_check || lhs_horiz_7 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ - } else if (lhs_horiz_6 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - } else if (lhs_horiz_5 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - } else if (lhs_horiz_4 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - } else if (lhs_horiz_3 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - } else if (lhs_horiz_2 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - } else if (lhs_horiz_1 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - } \ - } \ - \ - const Index rhs_vert = base_k + load_idx_vert; \ - if (!needs_edge_check || rhs_vert < k_size) { \ - const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ - const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ - const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ - const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ - const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ - const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ - const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ - const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ - \ - if (rhs_horiz_7 < n_size) { \ - rhs_pf0 = 
rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ - } else if (rhs_horiz_6 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - } else if (rhs_horiz_5 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - } else if (rhs_horiz_4 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - } else if (rhs_horiz_3 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - } else if (rhs_horiz_2 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - } else if (rhs_horiz_1 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - } \ - } \ - } \ - -#define writeRegToShmem(_) \ - lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ - rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ - \ - lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ - rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ - \ - lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ - rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ - \ - lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ - rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ - \ - lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ - rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ - \ - lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ - rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ - \ - lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ - rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ - \ - lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ - rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ - - // declare and initialize result array -#define res(i, j) _res_##i##j -#define initResultRow(i) \ - Scalar res(i, 0) = conv(0); \ - Scalar res(i, 1) = conv(0); \ - Scalar res(i, 2) = conv(0); \ - Scalar res(i, 3) = conv(0); \ - Scalar res(i, 4) = conv(0); \ - Scalar res(i, 5) = conv(0); \ - Scalar res(i, 6) = conv(0); \ - Scalar res(i, 7) = conv(0); \ - - internal::scalar_cast_op conv; - initResultRow(0); - initResultRow(1); - initResultRow(2); - initResultRow(3); - initResultRow(4); - initResultRow(5); - initResultRow(6); - initResultRow(7); -#undef initResultRow - - for (Index base_k = 0; base_k < k_size; base_k += 64) { - // wait for previous iteration to finish with shmem. 
Despite common sense, - // the code is a bit faster with this here then at bottom of loop - __syncthreads(); - - prefetchIntoRegisters(base_k); - writeRegToShmem(); - - #undef prefetchIntoRegisters - #undef writeRegToShmem - - // wait for shared mem packing to be done before starting computation - __syncthreads(); - - // compute 8x8 matrix product by outer product. This involves packing one column - // of LHS and one row of RHS into registers (takes 16 registers). - -#define lcol(i) _lcol##i - Scalar lcol(0); - Scalar lcol(1); - Scalar lcol(2); - Scalar lcol(3); - Scalar lcol(4); - Scalar lcol(5); - Scalar lcol(6); - Scalar lcol(7); - -#define rrow(j) _rrow##j - Scalar rrow(0); - Scalar rrow(1); - Scalar rrow(2); - Scalar rrow(3); - Scalar rrow(4); - Scalar rrow(5); - Scalar rrow(6); - Scalar rrow(7); - - // Now x corresponds to k, y to m, and z to n - const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; - const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; - -#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] -#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] - -#define loadData(i, j) \ - lcol(0) = lhs_element(0, j); \ - rrow(0) = rhs_element(i, 0); \ - lcol(1) = lhs_element(1, j); \ - rrow(1) = rhs_element(i, 1); \ - lcol(2) = lhs_element(2, j); \ - rrow(2) = rhs_element(i, 2); \ - lcol(3) = lhs_element(3, j); \ - rrow(3) = rhs_element(i, 3); \ - lcol(4) = lhs_element(4, j); \ - rrow(4) = rhs_element(i, 4); \ - lcol(5) = lhs_element(5, j); \ - rrow(5) = rhs_element(i, 5); \ - lcol(6) = lhs_element(6, j); \ - rrow(6) = rhs_element(i, 6); \ - lcol(7) = lhs_element(7, j); \ - rrow(7) = rhs_element(i, 7); \ - -#define computeCol(j) \ - res(0, j) += lcol(0) * rrow(j); \ - res(1, j) += lcol(1) * rrow(j); \ - res(2, j) += lcol(2) * rrow(j); \ - res(3, j) += lcol(3) * rrow(j); \ - res(4, j) += lcol(4) * rrow(j); \ - res(5, j) += lcol(5) * rrow(j); \ - res(6, j) += lcol(6) * rrow(j); \ - res(7, j) += lcol(7) * rrow(j); \ - -#define computePass(i) \ - loadData(i, i); \ - \ - computeCol(0); \ - computeCol(1); \ - computeCol(2); \ - computeCol(3); \ - computeCol(4); \ - computeCol(5); \ - computeCol(6); \ - computeCol(7); \ - - computePass(0); - computePass(1); - computePass(2); - computePass(3); - computePass(4); - computePass(5); - computePass(6); - computePass(7); - -#undef lcol -#undef rrow -#undef lhs_element -#undef rhs_element -#undef loadData -#undef computeCol -#undef computePass - } // end loop over k - - // we've now iterated over all of the large (ie width 64) k blocks and - // accumulated results in registers. At this point thread (x, y, z) contains - // the sum across all big k blocks of the product of little k block of index (x, y) - // with block of index (y, z). To compute the final output, we need to reduce - // the 8 threads over y by summation. 
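[Editor's note] The reduction described above, and implemented by the shuffleInc / reduceMatrix macros that follow, is a warp-shuffle butterfly: xor-shuffling with masks 1, 2 and 4 sums each register value across groups of 8 lanes in three steps, with no shared memory. A hedged sketch of the same pattern on a single value; note that current CUDA spells the intrinsic __shfl_xor_sync and requires an explicit participation mask:

    // After the three steps, every lane in each group of 8 holds the group's sum.
    __device__ float sumAcross8Lanes(float v) {
      v += __shfl_xor_sync(0xffffffffu, v, 1);
      v += __shfl_xor_sync(0xffffffffu, v, 2);
      v += __shfl_xor_sync(0xffffffffu, v, 4);
      return v;
    }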
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) - -#define reduceRow(i, mask) \ - shuffleInc(i, 0, mask); \ - shuffleInc(i, 1, mask); \ - shuffleInc(i, 2, mask); \ - shuffleInc(i, 3, mask); \ - shuffleInc(i, 4, mask); \ - shuffleInc(i, 5, mask); \ - shuffleInc(i, 6, mask); \ - shuffleInc(i, 7, mask); \ - -#define reduceMatrix(mask) \ - reduceRow(0, mask); \ - reduceRow(1, mask); \ - reduceRow(2, mask); \ - reduceRow(3, mask); \ - reduceRow(4, mask); \ - reduceRow(5, mask); \ - reduceRow(6, mask); \ - reduceRow(7, mask); \ - - // actually perform the reduction, now each thread of index (_, y, z) - // contains the correct values in its registers that belong in the output - // block - reduceMatrix(1); - reduceMatrix(2); - reduceMatrix(4); - -#undef shuffleInc -#undef reduceRow -#undef reduceMatrix - - // now we need to copy the 64 values into main memory. We can't split work - // among threads because all variables are in registers. There's 2 ways - // to do this: - // (1) have 1 thread do 64 writes from registers into global memory - // (2) have 1 thread do 64 writes into shared memory, and then 8 threads - // each do 8 writes into global memory. We can just overwrite the shared - // memory from the problem we just solved. - // (2) is slightly faster than (1) due to less branching and more ILP - - // TODO: won't yield much gain, but could just use currently unused shared mem - // and then we won't have to sync - // wait for shared mem to be out of use - __syncthreads(); - -#define writeResultShmem(i, j) \ - lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ - -#define writeRow(i) \ - writeResultShmem(i, 0); \ - writeResultShmem(i, 1); \ - writeResultShmem(i, 2); \ - writeResultShmem(i, 3); \ - writeResultShmem(i, 4); \ - writeResultShmem(i, 5); \ - writeResultShmem(i, 6); \ - writeResultShmem(i, 7); \ - - if (threadIdx.x == 0) { - writeRow(0); - writeRow(1); - writeRow(2); - writeRow(3); - writeRow(4); - writeRow(5); - writeRow(6); - writeRow(7); - } -#undef writeResultShmem -#undef writeRow - - const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); - const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); - - if (threadIdx.x < max_i_write) { - if (max_j_write == 8) { - // TODO: can i trade bank conflicts for coalesced writes? 
- Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; - Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; - Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; - Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; - Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; - Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; - Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; - Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; - - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; - } else { -#pragma unroll 7 - for (int j = 0; j < max_j_write; j++) { - Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; - } - } - } -#undef res -} - - -template -__global__ void -__launch_bounds__(512) -EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ Scalar lhs_shmem[72 * 64]; - __shared__ Scalar rhs_shmem[72 * 64]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size && base_n + 63 < n_size) { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } else { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } -} - - -template -__device__ EIGEN_STRONG_INLINE void -EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][16], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - typedef float Scalar; - - // prefetch registers - float4 lhs_pf0, rhs_pf0; - - float4 results[4]; - for (int i=0; i < 4; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - reg =lhs.loadPacket(row, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - reg =lhs.loadPacket(row, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ - - - Index lhs_vert = base_m+threadIdx.x*4; - - for (Index k = 0; k < k_size; k += 16) { - lhs_pf0 = 
internal::pset1(0); - rhs_pf0 = internal::pset1(0); - - Index lhs_horiz = threadIdx.y+k; - prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) - - Index rhs_vert = k+(threadIdx.x%4)*4; - Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; - - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } else { - if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - float x1, x2 ; - // the following can be a bitwise operation..... some day. - if((threadIdx.x%8) < 4) { - x1 = rhs_pf0.y; - x2 = rhs_pf0.w; - } else { - x1 = rhs_pf0.x; - x2 = rhs_pf0.z; - } - x1 = __shfl_xor(x1, 4); - x2 = __shfl_xor(x2, 4); - if((threadIdx.x%8) < 4) { - rhs_pf0.y = x1; - rhs_pf0.w = x2; - } else { - rhs_pf0.x = x1; - rhs_pf0.z = x2; - } - - // We have 64 features. - // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. - // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. - // ... - // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 - // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 - // ... - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); - - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // ... - // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) - // ... - - lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); - - -#define add_vals(fl1, fl2, fr1, fr2)\ - results[0].x += fl1.x * fr1.x;\ - results[0].y += fl1.y * fr1.x;\ - results[0].z += fl2.x * fr1.x;\ - results[0].w += fl2.y * fr1.x;\ -\ - results[1].x += fl1.x * fr1.y;\ - results[1].y += fl1.y * fr1.y;\ - results[1].z += fl2.x * fr1.y;\ - results[1].w += fl2.y * fr1.y;\ -\ - results[2].x += fl1.x * fr2.x;\ - results[2].y += fl1.y * fr2.x;\ - results[2].z += fl2.x * fr2.x;\ - results[2].w += fl2.y * fr2.x;\ -\ - results[3].x += fl1.x * fr2.y;\ - results[3].y += fl1.y * fr2.y;\ - results[3].z += fl2.x * fr2.y;\ - results[3].w += fl2.y * fr2.y;\ - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 16; koff ++) { - // 32 x threads. 
- float2 fl1 = lhs_shmem2[koff][threadIdx.x]; - float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; - - int start_feature = threadIdx.y * 4; - float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - - add_vals(fl1, fl2, fr1, fr2) - } - __syncthreads(); - } - -#undef prefetch_lhs -#undef add_vals - - Index horiz_base = threadIdx.y*4+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - // CHECK LHS - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK RHS - /* - int ncols_rem = fminf(n_size- horiz_base, 4); - for (int i = 0; i < ncols_rem; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - }*/ - for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. 
- for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__device__ EIGEN_STRONG_INLINE void -EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][32], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - typedef float Scalar; - - // prefetch registers - float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; - float4 rhs_pf0, rhs_pf1; - - float4 results[8]; - for (int i=0; i < 8; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - - Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; - for (Index k = 0; k < k_size; k += 32) { - lhs_pf0 = internal::pset1(0); - lhs_pf1 = internal::pset1(0); - lhs_pf2 = internal::pset1(0); - lhs_pf3 = internal::pset1(0); - - rhs_pf0 = internal::pset1(0); - rhs_pf1 = internal::pset1(0); - - if (!CHECK_LHS_BOUNDARY) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - } - } else { - // just CHECK_LHS_BOUNDARY - if (lhs_vert + 3 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 2 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - lhs_pf3.z 
=lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 1 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - } - } else if (lhs_vert < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - } - } - } - __syncthreads(); - Index rhs_vert = k+threadIdx.x*4; - Index rhs_horiz0 = threadIdx.y*2+base_n; - Index rhs_horiz1 = threadIdx.y*2+1+base_n; - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = 
rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else { - if (rhs_horiz1 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (k+threadIdx.x*4 + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (k+threadIdx.x*4 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - __syncthreads(); - // Loaded. Do computation - // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. - // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. - // .. - // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 - rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); - // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. - // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. - // .. - rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); - // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. - // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. - rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); - // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. - // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. - rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); - - // LHS. - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // ... - // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. 
(126, 127) - - -#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ - results[0].x += a_feat1.x * f1.x;\ - results[1].x += a_feat1.x * f1.y;\ - results[2].x += a_feat1.x * f2.x;\ - results[3].x += a_feat1.x * f2.y;\ - results[4].x += a_feat1.x * f3.x;\ - results[5].x += a_feat1.x * f3.y;\ - results[6].x += a_feat1.x * f4.x;\ - results[7].x += a_feat1.x * f4.y;\ -\ - results[0].y += a_feat1.y * f1.x;\ - results[1].y += a_feat1.y * f1.y;\ - results[2].y += a_feat1.y * f2.x;\ - results[3].y += a_feat1.y * f2.y;\ - results[4].y += a_feat1.y * f3.x;\ - results[5].y += a_feat1.y * f3.y;\ - results[6].y += a_feat1.y * f4.x;\ - results[7].y += a_feat1.y * f4.y;\ -\ - results[0].z += a_feat2.x * f1.x;\ - results[1].z += a_feat2.x * f1.y;\ - results[2].z += a_feat2.x * f2.x;\ - results[3].z += a_feat2.x * f2.y;\ - results[4].z += a_feat2.x * f3.x;\ - results[5].z += a_feat2.x * f3.y;\ - results[6].z += a_feat2.x * f4.x;\ - results[7].z += a_feat2.x * f4.y;\ -\ - results[0].w += a_feat2.y * f1.x;\ - results[1].w += a_feat2.y * f1.y;\ - results[2].w += a_feat2.y * f2.x;\ - results[3].w += a_feat2.y * f2.y;\ - results[4].w += a_feat2.y * f3.x;\ - results[5].w += a_feat2.y * f3.y;\ - results[6].w += a_feat2.y * f4.x;\ - results[7].w += a_feat2.y * f4.y;\ - - lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); - lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); - lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); - - lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); - lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); - lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); - lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 32; koff ++) { - float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; - float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; - - // first feature is at (threadIdx.y/4) * 8 last is at start + 8. 
- int start_feature = (threadIdx.y / 4) * 8; - - float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; - float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; - float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; - float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; - - add_vals(a3, a4, br1, br2, br3, br4) - } - __syncthreads(); - } // end loop over k - - - __syncthreads(); - Index horiz_base = (threadIdx.y/4)*8+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK BOUNDARY_B - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. 
- for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__global__ void -__launch_bounds__(256) -EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[64*32]; - __shared__ float2 rhs_shmem[128*8]; - - typedef float2 LHS_MEM[64][32]; - typedef float2 RHS_MEM[128][8]; - - typedef float2 LHS_MEM16x16[32][16]; - typedef float2 RHS_MEM16x16[64][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 128 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - bool check_rhs = (base_n + 63) >= n_size; - bool check_lhs128 = (base_m + 127) >= m_size; - - if (!check_rhs) { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } else { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } -} - -template -__global__ void -__launch_bounds__(256) -EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[32][16]; - __shared__ float2 rhs_shmem[64][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size) { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } else { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } -} - - -template -struct TensorEvaluator, GpuDevice> : - public TensorContractionEvaluatorBase, GpuDevice> > { - - typedef GpuDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout, - }; - - // Most of the code is assuming 
that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - - // We need to redefine this method to make nvcc happy - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; - } - } - - void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - } - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - }; - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - if (m < 768 || n < 768) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(16, 16, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } else { - const Index m_blocks = (m + 127) / 128; - const Index n_blocks = 
(n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 32, 1); - LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - } - }; - - template - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); - LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_GPU and __CUDACC__ -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h deleted file mode 100644 index 9b2cb3f..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ /dev/null @@ -1,467 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H - -namespace Eigen { - -namespace internal { - -enum { - Rhs = 0, - Lhs = 1 -}; - -/* - * Implementation of the Eigen blas_data_mapper class for tensors. 
- */ - -template struct CoeffLoader { - enum { - DirectOffsets = false - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { - eigen_assert(false && "unsupported"); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename Tensor::PacketReturnType packet(typename Tensor::Index index) const - { - return m_tensor.template packet(index); - } - - - private: - const Tensor m_tensor; -}; - -template struct CoeffLoader { - enum { - DirectOffsets = true - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { - m_data += offset; - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename Tensor::PacketReturnType packet(typename Tensor::Index index) const - { - return internal::ploadt_ro(m_data + index); - } - private: - typedef typename Tensor::Scalar Scalar; - const Scalar* m_data; -}; - -template -class SimpleTensorContractionMapper { - public: - EIGEN_DEVICE_FUNC - SimpleTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - m_tensor(tensor), - m_nocontract_strides(nocontract_strides), - m_ij_strides(ij_strides), - m_contract_strides(contract_strides), - m_k_strides(k_strides) { } - - enum { - DirectOffsets = CoeffLoader::DirectOffsets - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { - m_tensor.offsetBuffer(offset); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar operator()(Index row) const { - // column major assumption - return operator()(row, 0); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { - return m_tensor.coeff(computeIndex(row, col)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { - const bool left = (side == Lhs); - Index nocontract_val = left ? row : col; - Index linidx = 0; - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx = nocontract_val / m_ij_strides[i]; - linidx += idx * m_nocontract_strides[i]; - nocontract_val -= idx * m_ij_strides[i]; - } - if (array_size::value > array_size::value) { - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx += nocontract_val; - } else { - linidx += nocontract_val * m_nocontract_strides[0]; - } - } - - Index contract_val = left ? 
col : row; - if(array_size::value > 0) { - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx = contract_val / m_k_strides[i]; - linidx += idx * m_contract_strides[i]; - contract_val -= idx * m_k_strides[i]; - } - - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx += contract_val; - } else { - linidx += contract_val * m_contract_strides[0]; - } - } - - return linidx; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { - const bool left = (side == Lhs); - Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; - Index linidx[2] = {0, 0}; - if (array_size::value > array_size::value) { - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx0 = nocontract_val[0] / m_ij_strides[i]; - const Index idx1 = nocontract_val[1] / m_ij_strides[i]; - linidx[0] += idx0 * m_nocontract_strides[i]; - linidx[1] += idx1 * m_nocontract_strides[i]; - nocontract_val[0] -= idx0 * m_ij_strides[i]; - nocontract_val[1] -= idx1 * m_ij_strides[i]; - } - if (side == Lhs && inner_dim_contiguous) { - eigen_assert(m_nocontract_strides[0] == 1); - linidx[0] += nocontract_val[0]; - linidx[1] += nocontract_val[1]; - } else { - linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; - linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; - } - } - - Index contract_val[2] = {left ? col : row, left ? col : row + distance}; - if (array_size::value> 0) { - for (int i = static_cast(array_size::value) - 1; i > 0; i--) { - const Index idx0 = contract_val[0] / m_k_strides[i]; - const Index idx1 = contract_val[1] / m_k_strides[i]; - linidx[0] += idx0 * m_contract_strides[i]; - linidx[1] += idx1 * m_contract_strides[i]; - contract_val[0] -= idx0 * m_k_strides[i]; - contract_val[1] -= idx1 * m_k_strides[i]; - } - - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx[0] += contract_val[0]; - linidx[1] += contract_val[1]; - } else { - linidx[0] += contract_val[0] * m_contract_strides[0]; - linidx[1] += contract_val[1] * m_contract_strides[0]; - } - } - return IndexPair(linidx[0], linidx[1]); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { - // Only claim alignment when we can compute the actual stride (ie when we're - // dealing with the lhs with inner_dim_contiguous. This is because the - // matrix-vector product relies on the stride when dealing with aligned inputs. - return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { - return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? 
m_contract_strides[0] : 1; - } - - protected: - CoeffLoader m_tensor; - const nocontract_t m_nocontract_strides; - const nocontract_t m_ij_strides; - const contract_t m_contract_strides; - const contract_t m_k_strides; -}; - - -template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper -{ - public: - typedef SimpleTensorContractionMapper ParentMapper; - - EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - typedef typename Tensor::PacketReturnType Packet; - typedef typename unpacket_traits::half HalfPacket; - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - // whole method makes column major assumption - - // don't need to add offsets for now (because operator handles that) - // current code assumes packet size must be a multiple of 2 - EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - - if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { - const Index index = this->computeIndex(i, j); - eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); - return this->m_tensor.template packet(index); - } - - const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); - const Index first = indexPair.first; - const Index last = indexPair.second; - - // We can always do optimized packet reads from left hand side right now, because - // the vertical matrix dimension on the left hand side is never contracting. - // On the right hand side we need to check if the contracting dimensions may have - // been shuffled first. 
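The divide/subtract loops in computeIndex() above amount to a mixed-radix decomposition: the flattened row (or column) coordinate is peeled apart with the cumulative ij/k strides and re-accumulated with the tensor's own storage strides. A standalone sketch of that mapping follows; the 3-dimensional sizes and strides are made up and none of the names are Eigen API.

#include <array>
#include <cassert>

// Decompose `coord` using the cumulative strides of the flattened coordinate,
// then rebuild a linear offset using the tensor's storage strides. This mirrors
// the "idx = val / stride; val -= idx * stride;" pattern in computeIndex().
long mapToLinearIndex(long coord,
                      const std::array<long, 3>& coord_strides,
                      const std::array<long, 3>& tensor_strides) {
  long lin = 0;
  for (int i = 2; i > 0; --i) {
    const long idx = coord / coord_strides[i];
    lin += idx * tensor_strides[i];
    coord -= idx * coord_strides[i];
  }
  return lin + coord * tensor_strides[0];  // leftover innermost coordinate
}

int main() {
  // Hypothetical non-contracting dims of sizes 4, 5, 6 -> cumulative strides {1, 4, 20};
  // the same dims live in the tensor with storage strides {30, 1, 5}.
  const std::array<long, 3> coord_strides{1, 4, 20};
  const std::array<long, 3> tensor_strides{30, 1, 5};
  // coord 57 -> per-dim indices (1, 4, 2) -> 1*30 + 4*1 + 2*5 = 44.
  assert(mapToLinearIndex(57, coord_strides, tensor_strides) == 44);
}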
- if (Tensor::PacketAccess && - (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && - (last - first) == (packet_size - 1)) { - - return this->m_tensor.template packet(first); - } - - EIGEN_ALIGN_MAX Scalar data[packet_size]; - - data[0] = this->m_tensor.coeff(first); - for (Index k = 1; k < packet_size - 1; k += 2) { - const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); - data[k] = this->m_tensor.coeff(internal_pair.first); - data[k + 1] = this->m_tensor.coeff(internal_pair.second); - } - data[packet_size - 1] = this->m_tensor.coeff(last); - - return pload(data); - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { - // whole method makes column major assumption - - // don't need to add offsets for now (because operator handles that) - const Index half_packet_size = unpacket_traits::size; - if (half_packet_size == packet_size) { - return loadPacket(i, j); - } - EIGEN_ALIGN_MAX Scalar data[half_packet_size]; - for (Index k = 0; k < half_packet_size; k++) { - data[k] = operator()(i + k, j); - } - return pload(data); - } -}; - - -template -class BaseTensorContractionMapper : public SimpleTensorContractionMapper -{ - public: - typedef SimpleTensorContractionMapper ParentMapper; - - EIGEN_DEVICE_FUNC - BaseTensorContractionMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) : - ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - typedef typename Tensor::PacketReturnType Packet; - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { - EIGEN_ALIGN_MAX Scalar data[1]; - data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload(data); - } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { - return loadPacket(i, j); - } -}; - - -template -class TensorContractionSubMapper { - public: - typedef typename Tensor::PacketReturnType Packet; - typedef typename unpacket_traits::half HalfPacket; - - typedef BaseTensorContractionMapper ParentMapper; - typedef TensorContractionSubMapper Self; - typedef Self LinearMapper; - - enum { - // We can use direct offsets iff the parent mapper supports then and we can compute the strides. - // TODO: we should also enable direct offsets for the Rhs case. - UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size::value > 0) - }; - - EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) - : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { - // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute - // this offset every time we attempt to access a coefficient. 
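A minimal sketch of the offset-baking idea referred to above: for a column-major buffer with leading dimension `stride`, shifting the base pointer once by vert + horiz*stride lets the sub-mapper address coefficients with purely local (i, j) coordinates. The buffer and sizes below are invented for illustration.

#include <cassert>
#include <vector>

int main() {
  const int rows = 8, cols = 6;
  std::vector<float> buf(rows * cols);
  for (int i = 0; i < rows * cols; ++i) buf[i] = float(i);

  const int vert = 3, horiz = 2, stride = rows;
  const float* base = buf.data();
  const float* shifted = base + vert + horiz * stride;  // offset baked in once

  // Local coordinates on the shifted pointer reach the same coefficients as
  // global coordinates on the original base pointer.
  for (int j = 0; j < 2; ++j)
    for (int i = 0; i < 2; ++i)
      assert(shifted[i + j * stride] == base[(i + vert) + (j + horiz) * stride]);
}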
- if (UseDirectOffsets) { - Index stride = m_base_mapper.stride(); - m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); - } - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { - if (UseDirectOffsets) { - return m_base_mapper(i, 0); - } - return m_base_mapper(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { - if (UseDirectOffsets) { - return m_base_mapper(i, j); - } - return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { - if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); - } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { - if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, j); - } - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { - if (UseDirectOffsets) { - return m_base_mapper.template loadHalfPacket(i, 0); - } - return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { - if (UseDirectOffsets) { - m_base_mapper.storePacket(i, 0, p); - } - m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { - if (UseDirectOffsets) { - return LinearMapper(m_base_mapper, i, j); - } - return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? 
Aligned : Unaligned; - if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); - } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { - return false; - } - - private: - ParentMapper m_base_mapper; - const Index m_vert_offset; - const Index m_horiz_offset; -}; - - -template -class TensorContractionInputMapper - : public BaseTensorContractionMapper { - - public: - typedef Scalar_ Scalar; - typedef BaseTensorContractionMapper Base; - typedef TensorContractionSubMapper SubMapper; - typedef SubMapper VectorMapper; - - EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, - const nocontract_t& nocontract_strides, - const nocontract_t& ij_strides, - const contract_t& contract_strides, - const contract_t& k_strides) - : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { - return SubMapper(*this, i, j); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { - return VectorMapper(*this, i, j); - } -}; - - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h deleted file mode 100644 index c70dea0..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ /dev/null @@ -1,1043 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H - -// evaluator for thread pool device -#ifdef EIGEN_USE_THREADS - -namespace Eigen { - -#ifdef EIGEN_USE_SIMPLE_THREAD_POOL -namespace internal { - -template -struct packLhsArg { - LhsScalar* blockA; - const LhsMapper& lhs; - const Index m_start; - const Index k_start; - const Index mc; - const Index kc; -}; - -template -struct packRhsAndKernelArg { - const MaxSizeVector* blockAs; - RhsScalar* blockB; - const RhsMapper& rhs; - OutputMapper& output; - const Index m; - const Index k; - const Index n; - const Index mc; - const Index kc; - const Index nc; - const Index num_threads; - const Index num_blockAs; - const Index max_m; - const Index k_block_idx; - const Index m_block_idx; - const Index n_block_idx; - const Index m_blocks; - const Index n_blocks; - MaxSizeVector* kernel_notifications; - const MaxSizeVector* lhs_notifications; - const bool need_to_pack; -}; - -} // end namespace internal -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - -template -struct TensorEvaluator, ThreadPoolDevice> : - public TensorContractionEvaluatorBase, ThreadPoolDevice> > { - - typedef ThreadPoolDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. 
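The swap works because a row-major matrix reinterpreted as column-major is its transpose, and C^T = B^T * A^T. A self-contained check of that identity with a naive column-major kernel (plain C++, not Eigen; sizes and values are arbitrary):

#include <cassert>
#include <vector>

// out = L * R where L is MxK, R is KxN, all column-major.
static void gemm_colmajor(const float* L, const float* R, float* out,
                          int M, int K, int N) {
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < M; ++i) {
      float acc = 0.f;
      for (int p = 0; p < K; ++p) acc += L[i + p * M] * R[p + j * K];
      out[i + j * M] = acc;
    }
}

int main() {
  const int m = 2, k = 3, n = 4;
  // A (m x k) and B (k x n), stored row-major.
  const std::vector<float> A = {1, 2, 3,
                                4, 5, 6};
  const std::vector<float> B = {1, 0, 2, 1,
                                0, 1, 3, 0,
                                2, 1, 0, 1};

  // Reference row-major product C = A * B.
  std::vector<float> C_ref(m * n, 0.f);
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      for (int p = 0; p < k; ++p)
        C_ref[i * n + j] += A[i * k + p] * B[p * n + j];

  // The "cheat": feed the raw row-major buffers to a column-major kernel with
  // B as the left operand and A as the right one. The output buffer then holds
  // exactly the row-major C.
  std::vector<float> C_swapped(m * n, 0.f);
  gemm_colmajor(B.data(), A.data(), C_swapped.data(), /*M=*/n, /*K=*/k, /*N=*/m);

  assert(C_swapped == C_ref);
}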
- typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - typedef typename internal::gebp_traits Traits; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) {} - -#ifndef EIGEN_USE_SIMPLE_THREAD_POOL - template - void evalProduct(Scalar* buffer) const { - typedef internal::TensorContractionInputMapper< - LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, - contract_t, internal::packet_traits::size, - lhs_inner_dim_contiguous, false, Unaligned> - LhsMapper; - typedef internal::TensorContractionInputMapper< - RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, - contract_t, internal::packet_traits::size, - rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> - RhsMapper; - typedef internal::blas_data_mapper OutputMapper; - typedef internal::gemm_pack_lhs - LhsPacker; - typedef internal::gemm_pack_rhs< - RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> - RhsPacker; - typedef internal::gebp_kernel - GebpKernel; - - const Index m = this->m_i_size; - const Index n = this->m_j_size; - const Index k = this->m_k_size; - if (m == 0 || n == 0 || k == 0) return; - - // Compute a set of algorithm parameters: - // - kernel block sizes (bm, bn, bk) - // - task grain sizes (number of kernels executed per task: gm, gn) - // - number of threads - // - sharding by row/column - // - parallel packing or first lhs then rhs - // and some derived parameters: - // - number of tasks (nm, nn, nk) - // - number of kernels (nm0, nn0) - // Unfortunately, all these parameters are tightly interdependent. - // So in some cases we first compute approximate values, then compute other - // values based on these approximations and then refine the approximations. - - // There are lots of heuristics here. There is some reasoning behind them, - // but ultimately they are just tuned on contraction benchmarks for - // different input configurations, thread counts and instruction sets. - // So feel free to question any of them. - - // Compute whether we want to shard by row or by column. - // This is a first approximation, it will be refined later. Since we don't - // know number of threads yet we use 2, because what's we are most - // interested in at this point is whether it makes sense to use - // parallelization at all or not. - bool shard_by_col = shardByCol(m, n, 2); - - // First approximation of kernel blocking sizes. - // Again, we don't know number of threads yet, so we use 2. 
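The block and task bookkeeping used below boils down to a ceiling division per dimension, with a possibly partial final block. A tiny sketch with invented sizes (these are not Eigen's tuned block sizes):

#include <cstdio>

static long divup(long x, long y) { return (x + y - 1) / y; }

int main() {
  const long m = 1000, n = 700, k = 300;   // contraction sizes (made up)
  const long bm = 128, bn = 64, bk = 32;   // kernel block sizes (made up)

  const long nm0 = divup(m, bm);           // 8 row blocks
  const long nn0 = divup(n, bn);           // 11 column blocks
  const long nk  = divup(k, bk);           // 10 k slices

  const long last_bm = m - bm * (nm0 - 1); // the final row block is only 104 rows
  std::printf("nm0=%ld nn0=%ld nk=%ld last_bm=%ld\n", nm0, nn0, nk, last_bm);
}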
- Index bm, bn, bk; - if (shard_by_col) { - internal::TensorContractionBlocking - blocking(k, m, n, 2); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } else { - internal::TensorContractionBlocking - blocking(k, m, n, 2); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } - - // Compute optimal number of threads. - // Note: we use bk instead of k here because we are interested in amount of - // _parallelizable_ computations, and computations are not parallelizable - // across k dimension. - const TensorOpCost cost = - contractionCost(m, n, bm, bn, bk, shard_by_col, false); - int num_threads = TensorCostModel::numThreads( - static_cast(n) * m, cost, this->m_device.numThreads()); - - // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost - // model is not tuned. Remove this when the cost model is tuned. - if (n == 1) num_threads = 1; - - if (num_threads == 1) { - // The single-threaded algorithm should be faster in this case. - if (n == 1) - this->template evalGemv(buffer); - else - this->template evalGemm(buffer); - return; - } - - // Now that we know number of threads, recalculate sharding and blocking. - shard_by_col = shardByCol(m, n, num_threads); - if (shard_by_col) { - internal::TensorContractionBlocking - blocking(k, m, n, num_threads); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } else { - internal::TensorContractionBlocking - blocking(k, m, n, num_threads); - bm = blocking.mc(); - bn = blocking.nc(); - bk = blocking.kc(); - } - - // Number of kernels for each dimension. - Index nm0 = divup(m, bm); - Index nn0 = divup(n, bn); - Index nk = divup(k, bk); - - // Calculate task grain size (number of kernels executed per task). - // This task size coarsening serves two purposes: - // 1. It reduces per-task overheads including synchronization overheads. - // 2. It allows to use caches better (reuse the same packed rhs in several - // consecutive kernels). - Index gm = 1; - Index gn = 1; - // If we are sharding by column, then we prefer to reduce rows first. - if (shard_by_col) { - gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); - gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); - } else { - gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); - gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); - } - // Number of tasks in each dimension. - Index nm = divup(nm0, gm); - Index nn = divup(nn0, gn); - - // Last by not least, decide whether we want to issue both lhs and rhs - // packing in parallel; or issue lhs packing first, and then issue rhs - // packing when lhs packing completes (for !shard_by_col lhs and rhs are - // swapped). Parallel packing allows more parallelism (for both packing and - // kernels), while sequential packing provides better locality (once - // a thread finishes rhs packing it proceed to kernels with that rhs). - // First, we are interested in parallel packing if there are few tasks. - bool parallel_pack = num_threads >= nm * nn; - // Also do parallel packing if all data fits into L2$. - if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= - l2CacheSize() * num_threads) - parallel_pack = true; - // But don't do it if we will use each rhs only once. Locality seems to be - // more important in this case. - if ((shard_by_col ? 
nm : nn) == 1) parallel_pack = false; - - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, - this->m_i_strides, this->m_left_contracting_strides, - this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, - this->m_j_strides, this->m_right_contracting_strides, - this->m_k_strides); - - Context(this->m_device, num_threads, lhs, rhs, buffer, m, n, - k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, - shard_by_col, parallel_pack) - .run(); - } - - // Context coordinates a single parallel gemm operation. - template - class Context { - public: - Context(const Device& device, int num_threads, LhsMapper& lhs, - RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, - Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, - Index gn, Index nm0, Index nn0, bool shard_by_col, - bool parallel_pack) - : device_(device), - lhs_(lhs), - rhs_(rhs), - buffer_(buffer), - output_(buffer, tm), - num_threads_(num_threads), - shard_by_col_(shard_by_col), - parallel_pack_(parallel_pack), - m_(tm), - n_(tn), - k_(tk), - bm_(bm), - bn_(bn), - bk_(bk), - nm_(nm), - nn_(nn), - nk_(nk), - gm_(gm), - gn_(gn), - nm0_(nm0), - nn0_(nn0) - { - for (Index x = 0; x < P; x++) { - // Normal number of notifications for k slice switch is - // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only - // nm_ + nn_ notifications, because they will not receive notifications - // from preceeding kernels. - state_switch_[x] = - x == 0 - ? 1 - : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + - (x == P - 1 ? nm_ * nn_ : 0); - state_packing_ready_[x] = - parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_); - state_kernel_[x] = new std::atomic*[nm_]; - for (Index m = 0; m < nm_; m++) { - state_kernel_[x][m] = new std::atomic[nn_]; - // Kernels generally receive 3 notifications (previous kernel + 2 - // packing), but the first slice won't get notifications from previous - // kernels. - for (Index n = 0; n < nn_; n++) - state_kernel_[x][m][n].store( - (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), - std::memory_order_relaxed); - } - } - - // Allocate memory for packed rhs/lhs matrices. - size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); - size_t lhs_size = - divup(bm_ * bk_ * sizeof(LhsScalar), align) * align; - size_t rhs_size = - divup(bn_ * bk_ * sizeof(RhsScalar), align) * align; - packed_mem_ = static_cast(internal::aligned_malloc( - (nm0_ * lhs_size + nn0_ * rhs_size) * std::min(nk_, P - 1))); - char* mem = static_cast(packed_mem_); - for (Index x = 0; x < numext::mini(nk_, P - 1); x++) { - packed_lhs_[x].resize(nm0_); - for (Index m = 0; m < nm0_; m++) { - packed_lhs_[x][m] = reinterpret_cast(mem); - mem += lhs_size; - } - packed_rhs_[x].resize(nn0_); - for (Index n = 0; n < nn0_; n++) { - packed_rhs_[x][n] = reinterpret_cast(mem); - mem += rhs_size; - } - } - } - - ~Context() { - for (Index x = 0; x < P; x++) { - for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; - delete[] state_kernel_[x]; - } - internal::aligned_free(packed_mem_); - } - - void run() { - // Kick off packing of the first slice. - signal_switch(0, 1); - // Wait for overall completion. - // TODO(dvyukov): this wait can lead to deadlock. - // If nthreads contractions are concurrently submitted from worker - // threads, this wait will block all worker threads and the system will - // deadlock. 
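The state_switch_/signal_* machinery set up above is a countdown pattern: each contributing task decrements an atomic counter, and whichever task brings it to zero performs the transition. A minimal standalone sketch of that pattern (it ignores the rolling k-slice bookkeeping of the real Context class):

#include <atomic>
#include <cassert>
#include <thread>
#include <vector>

int main() {
  std::atomic<int> pending(8);      // e.g. nm + nn packing tasks for one slice
  std::atomic<bool> switched(false);

  auto signal = [&]() {
    if (pending.fetch_sub(1) == 1)  // the last contributor flips the switch
      switched.store(true);
  };

  std::vector<std::thread> workers;
  for (int i = 0; i < 8; ++i) workers.emplace_back(signal);
  for (auto& t : workers) t.join();

  assert(pending.load() == 0 && switched.load());
}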
- done_.Wait(); - } - - private: - Notification done_; - const Device& device_; - LhsMapper& lhs_; - RhsMapper& rhs_; - Scalar* const buffer_; - OutputMapper output_; - const int num_threads_; - const bool shard_by_col_; - const bool parallel_pack_; - // Matrix sizes. - const Index m_; - const Index n_; - const Index k_; - // Block sizes. - const Index bm_; - const Index bn_; - const Index bk_; - // Number of tasks. - const Index nm_; - const Index nn_; - const Index nk_; - // Task grain sizes (number of kernels executed per task). - const Index gm_; - const Index gn_; - // Number of blocks (this is different from ni_/nn_ because of task size - // coarsening). - const Index nm0_; - const Index nn0_; - - // Parallelization strategy. - // - // Blocks related to the same k block can run in parallel because they write - // to different output blocks. So we parallelize within k slices, this - // gives us parallelism level of m x n. Before we can start any kernels - // related to k-th slice, we need to issue m lhs packing tasks and n rhs - // packing tasks. - // - // However, there is a bottleneck when we are finishing kernels for k-th - // slice (at the very end there is only 1 runnable kernel). To mitigate this - // bottleneck we allow kernels from k-th and k+1-th slices to run in - // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same - // output block, so they must not run in parallel. - // - // This gives us the following dependency graph. - // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs - // packing tasks. - // Kernel (m, n, k) can start when: - // - kernel (m, n, k-1) has finished - // - lhs packing (m, k) has finished - // - rhs packing (n, k) has finished - // Lhs/rhs packing can start when: - // - all k-1 packing has finished (artificially imposed to limit amount of - // parallel packing) - // - // On top of that we limit runnable tasks to two consecutive k slices. - // This is done to limit amount of memory we need for packed lhs/rhs - // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_). - // - // state_switch_ tracks when we are ready to switch to the next k slice. - // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n). - // These variable are rolling over 3 consecutive k slices: first two we are - // actively executing + one to track completion of kernels in the second - // slice. - static const Index P = 3; - void* packed_mem_; - std::vector packed_lhs_[P - 1]; - std::vector packed_rhs_[P - 1]; - std::atomic** state_kernel_[P]; - // state_switch_ is frequently modified by worker threads, while other - // fields are read-only after constructor. Let's move it to a separate cache - // line to reduce cache-coherency traffic. - char pad_[128]; - std::atomic state_packing_ready_[P]; - std::atomic state_switch_[P]; - - void pack_lhs(Index m, Index k) { - const Index mend = m * gm_ + gm(m); - for (Index m1 = m * gm_; m1 < mend; m1++) - LhsPacker()(packed_lhs_[k % (P - 1)][m1], - lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); - - if (!parallel_pack_ && shard_by_col_) { - signal_packing(k); - } else { - signal_switch(k + 1); - for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); - } - } - - void pack_rhs(Index n, Index k) { - const Index nend = n * gn_ + gn(n); - for (Index n1 = n * gn_; n1 < nend; n1++) { - if (k == 0) { - // Zero the output memory in parallel. - // On 10000x2x10000 mm zeroing can easily take half of time. - // Zero (bn x m) row. 
Safe to do here because all kernels that will - // write to this memory depend on completion of this task. - // Note: don't call device_.memset() here. device_.memset() blocks on - // thread pool worker thread, which can lead to underutilization and - // deadlocks. - memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); - } - RhsPacker()(packed_rhs_[k % (P - 1)][n1], - rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); - } - - if (parallel_pack_ || shard_by_col_) { - signal_switch(k + 1); - for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); - } else { - signal_packing(k); - } - } - - void kernel(Index m, Index n, Index k) { - // Note: order of iteration matters here. Iteration over m is innermost - // because we want to reuse the same packed rhs in consequetive tasks - // (rhs fits into L2$ while lhs only into L3$). - const Index nend = n * gn_ + gn(n); - const Index mend = m * gm_ + gm(m); - if (shard_by_col_) { - for (Index n1 = n * gn_; n1 < nend; n1++) { - for (Index m1 = m * gm_; m1 < mend; m1++) - GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), - packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), - Scalar(1), -1, -1, 0, 0); - } - } else { - for (Index m1 = m * gm_; m1 < mend; m1++) - for (Index n1 = n * gn_; n1 < nend; n1++) { - GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), - packed_lhs_[k % (P - 1)][m1], - packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), - Scalar(1), -1, -1, 0, 0); - } - } - signal_kernel(m, n, k + 1, false); - signal_switch(k + 2); - } - - void signal_packing(Index k) { - eigen_assert(!parallel_pack_); - Index s = state_packing_ready_[k % P].fetch_sub(1); - eigen_assert(s > 0); - if (s != 1) return; - state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; - enqueue_packing(k, shard_by_col_); - } - - void signal_kernel(Index m, Index n, Index k, bool sync) { - std::atomic* state = &state_kernel_[k % P][m][n]; - Index s = state->load(); - eigen_assert(s > 0); - if (s != 1 && state->fetch_sub(1) != 1) return; - state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); - if (sync) - kernel(m, n, k); - else - device_.enqueueNoNotification([=]() { kernel(m, n, k); }); - } - - void signal_switch(Index k, Index v = 1) { - Index s = state_switch_[k % P].fetch_sub(v); - eigen_assert(s >= v); - if (s != v) return; - - // Ready to switch to the next k slice. - // Reset counter for the next iteration. - state_switch_[k % P] = - (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + - nm_ * nn_; - if (k < nk_) { - // Issue lhs/rhs packing. Their completion will in turn kick off - // kernels. - if (parallel_pack_) { - enqueue_packing(k, !shard_by_col_); - enqueue_packing(k, shard_by_col_); - } else if (shard_by_col_) { - enqueue_packing(k, false); - } else { - enqueue_packing(k, true); - } - - // Termination handling. - // Because kernel completion signals k + 2 switch, we need to finish nk - // + 2 slices without issuing any tasks on nk + 1 slice. So here we - // pretend that all nk + 1 packing tasks just finish instantly; so that - // nk + 2 switch only waits for completion of nk kernels. - } else if (k == nk_) { - signal_switch(k + 1, - parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); - } else { - done_.Notify(); - } - } - - // Enqueue all rhs/lhs packing for k-th slice. - void enqueue_packing(Index k, bool rhs) { - enqueue_packing_helper(0, rhs ? 
nn_ : nm_, k, rhs); - } - - void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { - if (end - start == 1) { - if (rhs) - pack_rhs(start, k); - else - pack_lhs(start, k); - } else { - Index mid = (start + end) / 2; - device_.enqueueNoNotification( - [=]() { enqueue_packing_helper(mid, end, k, rhs); }); - device_.enqueueNoNotification( - [=]() { enqueue_packing_helper(start, mid, k, rhs); }); - } - } - - // Block sizes with accounting for potentially incomplete last block. - Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } - Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } - Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } - // Task grain sizes accounting for potentially incomplete last task. - Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } - Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; } - - Context(const Context&) = delete; - void operator=(const Context&) = delete; - }; - - // Decide whether we want to shard m x n contraction by columns or by rows. - static bool shardByCol(Index m, Index n, Index num_threads) { - // Note: we are comparing both n and m against Traits::nr, it is not - // a mistake. We are trying to figure out how both n and m will fit into - // the main sharding dimension. - - // Sharding by column is the default - // ... unless there is enough data for vectorization over rows - if (m / num_threads >= Traits::nr && - // and not enough data for vectorization over columns - (n / num_threads < Traits::nr || - // ... or barely enough data for vectorization over columns, - // but it is not evenly dividable across threads - (n / num_threads < 4 * Traits::nr && - (n % (num_threads * Traits::nr)) != 0 && - // ... and it is evenly dividable across threads for rows - ((m % (num_threads * Traits::nr)) == 0 || - // .. or it is not evenly dividable for both dimensions but - // there is much more data over rows so that corner effects are - // mitigated. - (m / n >= 6))))) - return false; - // Wait, or if matrices are just substantially prolonged over the other - // dimension. - if (n / num_threads < 16 * Traits::nr && m > n * 32) return false; - return true; - } - - Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, - int num_threads, bool shard_by_col) const { - Index gm = 1; - Index gm1 = 1; - Index nm0 = divup(m, bm); - Index nm1 = nm0; - for (;;) { - // Find the next candidate for m grain size. It needs to result in - // different number of blocks. E.g. if we have 10 kernels, we want to try - // 5 and 10, but not 6, 7, 8 and 9. - while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++; - if (gm1 > nm0) break; - // Check the candidate. - int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, - shard_by_col); - if (res < 0) break; - nm1 = divup(nm0, gm1); - if (res == 0) continue; - // Commit new grain size. 
- gm = gm1; - } - return gm; - } - - Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, - int num_threads, bool shard_by_col) const { - Index gn = 1; - Index gn1 = 1; - Index nn0 = divup(n, bn); - Index nn1 = nn0; - for (;;) { - while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++; - if (gn1 > nn0) break; - int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, - shard_by_col); - if (res < 0) break; - nn1 = divup(nn0, gn1); - if (res == 0) continue; - gn = gn1; - } - return gn; - } - - // checkGrain checks whether grain (gm, gn) is suitable and is better than - // (oldgm, oldgn). - int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, - Index gn, Index oldgm, Index oldgn, int num_threads, - bool shard_by_col) const { - const TensorOpCost cost = - contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); - double taskSize = TensorCostModel::taskSize( - static_cast(bm) * gm * bn * gn, cost); - // If the task is too small, then we agree on it regardless of anything - // else. Otherwise synchronization overheads will dominate. - if (taskSize < 1) return 1; - // If it is too large, then we reject it and all larger tasks. - if (taskSize > 2) return -1; - // Now we are in presumably good task size range. - // The main deciding factor here is parallelism. Consider that we have 12 - // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. - // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 - // of cores will be busy). While grain size 3 gives us 4 tasks, which gives - // us parallelism of 1 (we can load all cores). - Index nm0 = divup(m, bm); - Index nn0 = divup(n, bn); - Index new_tasks = divup(nm0, gm) * divup(nn0, gn); - double new_parallelism = static_cast(new_tasks) / - (divup(new_tasks, num_threads) * num_threads); - Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn); - double old_parallelism = static_cast(old_tasks) / - (divup(old_tasks, num_threads) * num_threads); - if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; - return 0; - } - -#else // EIGEN_USE_SIMPLE_THREAD_POOL - - template - void evalProduct(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - return; - } - - evalGemm(buffer); - } - - template - void evalGemm(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - - const int lhs_packet_size = internal::unpacket_traits::size; - const int rhs_packet_size = internal::unpacket_traits::size; - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - // TODO: packing could be faster sometimes if we supported row major tensor mappers - typedef internal::gemm_pack_lhs LhsPacker; - typedef internal::gemm_pack_rhs RhsPacker; - - // TODO: replace false, false with conjugate values? 
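A worked version of the parallelism measure used by checkGrain() above, with the 12-kernel / 4-thread scenario from its comment: a grain of 3 keeps every core busy, while grains of 2 or 4 leave a quarter of the machine idle on the last wave.

#include <cstdio>

static long divup(long x, long y) { return (x + y - 1) / y; }

static double parallelism(long kernels, long grain, long threads) {
  const long tasks = divup(kernels, grain);
  return double(tasks) / double(divup(tasks, threads) * threads);
}

int main() {
  const long kernels = 12, threads = 4;
  for (long grain : {2L, 3L, 4L})
    std::printf("grain %ld -> parallelism %.2f\n", grain,
                parallelism(kernels, grain, threads));
  // grain 2 -> 0.75, grain 3 -> 1.00, grain 4 -> 0.75
}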
- typedef internal::gebp_kernel GebpKernel; - - typedef internal::packLhsArg packLArg; - typedef internal::packRhsAndKernelArg packRKArg; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - - // compute block sizes (which depend on number of threads) - const Index num_threads = this->m_device.numThreads(); - internal::TensorContractionBlocking blocking(k, m, n, num_threads); - Index mc = blocking.mc(); - Index nc = blocking.nc(); - Index kc = blocking.kc(); - eigen_assert(mc <= m); - eigen_assert(nc <= n); - eigen_assert(kc <= k); - -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) - const Index k_blocks = CEIL_DIV(k, kc); - const Index n_blocks = CEIL_DIV(n, nc); - const Index m_blocks = CEIL_DIV(m, mc); - const Index sizeA = mc * kc; - const Index sizeB = kc * nc; - - /* cout << "m: " << m << " n: " << n << " k: " << k << endl; - cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; - cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; - cout << "num threads: " << num_threads << endl; - */ - - // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB - // aren't 16 byte aligned segfaults will happen due to SIMD instructions - // note: You can get away with allocating just a single blockA and offsets and meet the - // the alignment requirements with the assumption that - // (Traits::mr * sizeof(ResScalar)) % 16 == 0 - const Index numBlockAs = numext::mini(num_threads, m_blocks); - MaxSizeVector blockAs(num_threads); - for (int i = 0; i < num_threads; i++) { - blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); - } - - // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread - // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. - // Other options: (1) reuse memory when a thread finishes. con: tricky - // (2) allocate block B memory in each thread. 
con: overhead - MaxSizeVector blockBs(n_blocks); - for (int i = 0; i < n_blocks; i++) { - blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); - } - - // lhs_notifications starts with all null Notifications - MaxSizeVector lhs_notifications(num_threads, nullptr); - - // this should really be numBlockAs * n_blocks; - const Index num_kernel_notifications = num_threads * n_blocks; - MaxSizeVector kernel_notifications(num_kernel_notifications, - nullptr); - - for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { - const Index k_start = k_block_idx * kc; - // make sure we don't overshoot right edge of left matrix - const Index actual_kc = numext::mini(k_start + kc, k) - k_start; - - for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { - const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs); - - for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { - const Index m_start = mt_block_idx * mc; - const Index actual_mc = numext::mini(m_start + mc, m) - m_start; - eigen_assert(actual_mc > 0); - - Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; - - for (int i = 0; i < n_blocks; ++i) { - Index notification_id = (blockAId * n_blocks + i); - // Wait for any current kernels using this slot to complete - // before using it. - if (kernel_notifications[notification_id]) { - wait_until_ready(kernel_notifications[notification_id]); - delete kernel_notifications[notification_id]; - } - kernel_notifications[notification_id] = new Notification(); - } - const packLArg arg = { - blockAs[blockAId], // blockA - lhs, // lhs - m_start, // m - k_start, // k - actual_mc, // mc - actual_kc, // kc - }; - - // Delete any existing notification since we may be - // replacing it. The algorithm should ensure that there are - // no existing waiters on this notification. - delete lhs_notifications[blockAId]; - lhs_notifications[blockAId] = - this->m_device.enqueue(&Self::packLhs, arg); - } - - // now start kernels. - const Index m_base_start = m_block_idx * mc; - const bool need_to_pack = m_block_idx == 0; - - for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { - const Index n_start = n_block_idx * nc; - const Index actual_nc = numext::mini(n_start + nc, n) - n_start; - - // first make sure the previous kernels are all done before overwriting rhs. Also wait if - // we're going to start new k. In both cases need_to_pack is true. - if (need_to_pack) { - for (Index i = num_blocks; i < num_threads; ++i) { - Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; - Index future_id = (blockAId * n_blocks + n_block_idx); - wait_until_ready(kernel_notifications[future_id]); - } - } - - packRKArg arg = { - &blockAs, // blockA - blockBs[n_block_idx], // blockB - rhs, // rhs - output, // output - m_base_start, // m - k_start, // k - n_start, // n - mc, // mc - actual_kc, // kc - actual_nc, // nc - num_threads, - numBlockAs, - m, - k_block_idx, - m_block_idx, - n_block_idx, // n_block_idx - m_blocks, // m_blocks - n_blocks, // n_blocks - &kernel_notifications, // kernel notifications - &lhs_notifications, // lhs notifications - need_to_pack, // need_to_pack - }; - - // We asynchronously kick off this function, which ends up - // notifying the appropriate kernel_notifications objects, - // which this thread waits on before exiting. 
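The packed-lhs scratch buffers above are reused round-robin: block (k_block_idx, mt_block_idx) lands in slot (k_block_idx * m_blocks + mt_block_idx) % num_threads, and the kernel_notifications wait ensures a slot is only overwritten once its previous contents have been consumed. A small sketch of that slot assignment, with invented block counts:

#include <cstdio>

int main() {
  const int num_threads = 4, m_blocks = 6, k_blocks = 2;
  for (int k = 0; k < k_blocks; ++k)
    for (int mt = 0; mt < m_blocks; ++mt)
      std::printf("k=%d mt=%d -> blockA slot %d\n",
                  k, mt, (k * m_blocks + mt) % num_threads);
}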
- this->m_device.enqueueNoNotification(&Self::packRhsAndKernel, arg); - } - } - } - - // Make sure all the kernels are done. - for (size_t i = 0; i < kernel_notifications.size(); ++i) { - wait_until_ready(kernel_notifications[i]); - delete kernel_notifications[i]; - } - - // No need to wait for lhs notifications since they should have - // already been waited on. Just clean them up. - for (size_t i = 0; i < lhs_notifications.size(); ++i) { - delete lhs_notifications[i]; - } - - // deallocate all of the memory for both A and B's - for (size_t i = 0; i < blockAs.size(); i++) { - this->m_device.deallocate(blockAs[i]); - } - for (size_t i = 0; i < blockBs.size(); i++) { - this->m_device.deallocate(blockBs[i]); - } - -#undef CEIL_DIV - } - - /* - * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing - * the LHS block, check that all of the kernels that worked on the same - * mt_block_idx in the previous m_block are done. - */ - template - static void packLhs(const packLArg arg) { - // perform actual packing - LhsPacker pack_lhs; - pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); - } - - /* - * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that - * all kernels in the previous block are done. - * Then for each LHS future, we wait on the future and then call GEBP - * on the area packed by the future (which starts at - * blockA + future_idx * mt * kc) on the LHS and with the full packed - * RHS block. - * The output of this GEBP is written to output(m + i * mt, n). - */ - template - static void packRhsAndKernel(packRKArg arg) { - if (arg.need_to_pack) { - RhsPacker pack_rhs; - pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); - } - - GebpKernel gebp; - for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { - const Index m_base_start = arg.m + arg.mc*mt_block_idx; - if (m_base_start < arg.max_m) { - Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; - wait_until_ready((*arg.lhs_notifications)[blockAId]); - const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; - gebp(arg.output.getSubMapper(m_base_start, arg.n), - (*arg.blockAs)[blockAId], arg.blockB, - actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); - - // Notify that the kernel is done. - const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; - (*arg.kernel_notifications)[set_idx]->Notify(); - } - } - } -#endif // EIGEN_USE_SIMPLE_THREAD_POOL - - TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, - bool shard_by_col, bool prepacked) const { - const int packed_size = std::min(PacketType::size, - PacketType::size); - const int output_packet_size = internal::unpacket_traits::size; - const double kd = static_cast(bk); - // Peak VFMA bandwidth is 0.5. However if we have not enough data for - // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined - // experimentally. - double computeBandwidth = bk == 1 ? 4.0 : - (shard_by_col ? bn : bm) < Traits::nr || - (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5; -#ifndef EIGEN_VECTORIZE_FMA - // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. - // However for MULPS/ADDPS we have dependent sequence of 2 such instructions, - // so overall bandwidth is 1.0. - if (computeBandwidth == 0.5) computeBandwidth = 1.0; -#endif - // Computations. 
- TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size); - // Output stores. - cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); - if (prepacked) { - // Packing and kernels are executed in different tasks. When we calculate - // task grain size we look only at kernel cost assuming that kernel - // is more expensive than packing. - return cost; - } - // Lhs/rhs loads + computations. - TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); - TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); - // Lhs packing memory cost does not contribute considerably to overall - // execution time because lhs is prefetched early and accessed sequentially. - if (shard_by_col) - lhsCost.dropMemoryCost(); - else - rhsCost.dropMemoryCost(); - return cost + lhsCost + rhsCost; - } -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_THREADS -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h deleted file mode 100644 index 860a694..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ /dev/null @@ -1,279 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H - -namespace Eigen { - -/** \class TensorConversionOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor conversion class. This class makes it possible to vectorize - * type casting operations when the number of scalars per packet in the source - * and the destination type differ - */ -namespace internal { -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. 
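A rough restatement of the compute-bandwidth heuristic described above, with the constants taken directly from the comments (0.5 cycles per element with FMA and large blocks, 2.0 when the blocks are too small to vectorize well, 4.0 when bk == 1, and 0.5 promoted to 1.0 without FMA). The block sizes and register-blocking values below are illustrative stand-ins, not Eigen's tuned numbers.

#include <cstdio>

static double computeBandwidth(long bk, long bm, long bn, bool shard_by_col,
                               long nr, long mr, bool has_fma) {
  double bw = (bk == 1) ? 4.0
            : ((shard_by_col ? bn : bm) < nr || (shard_by_col ? bm : bn) < mr) ? 2.0
                                                                               : 0.5;
  if (!has_fma && bw == 0.5) bw = 1.0;  // dependent MULPS+ADDPS pair
  return bw;
}

int main() {
  std::printf("%.1f\n", computeBandwidth(/*bk=*/256, /*bm=*/128, /*bn=*/64,
                                         /*shard_by_col=*/true, /*nr=*/4, /*mr=*/8,
                                         /*has_fma=*/true));   // prints 0.5
}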
- typedef TargetType Scalar; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; - enum { Flags = 0 }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorConversionOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorConversionOp type; -}; - -} // end namespace internal - - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - return internal::pcast(m_impl.template packet(index)); - } - - private: - const TensorEvaluator& m_impl; -}; - - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits::size; - - SrcPacket src1 = m_impl.template packet(index); - SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); - TgtPacket result = internal::pcast(src1, src2); - return result; - } - - private: - const TensorEvaluator& m_impl; -}; - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits::size; - - SrcPacket src1 = m_impl.template packet(index); - SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); - SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); - SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); - TgtPacket result = internal::pcast(src1, src2, src3, src4); - return result; - } - - private: - const TensorEvaluator& m_impl; -}; - -template -struct PacketConverter { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketConverter(const TensorEvaluator& impl) - : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { - const int SrcPacketSize = internal::unpacket_traits::size; - // Only call m_impl.packet() when we have direct access to the underlying data. This - // ensures that we don't compute the subexpression twice. We may however load some - // coefficients twice, but in practice this doesn't negatively impact performance. 
- if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) { - // Force unaligned memory loads since we can't ensure alignment anymore - return internal::pcast(m_impl.template packet(index)); - } else { - const int TgtPacketSize = internal::unpacket_traits::size; - typedef typename internal::unpacket_traits::type SrcType; - typedef typename internal::unpacket_traits::type TgtType; - internal::scalar_cast_op converter; - EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; - for (int i = 0; i < TgtPacketSize; ++i) { - values[i] = converter(m_impl.coeff(index+i)); - } - TgtPacket rslt = internal::pload(values); - return rslt; - } - } - - private: - const TensorEvaluator& m_impl; - const typename TensorEvaluator::Index m_maxIndex; -}; - -template -class TensorConversionOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename internal::traits::Scalar Scalar; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - typedef typename internal::nested::type Nested; - typedef Scalar CoeffReturnType; - typedef typename NumTraits::Real RealScalar; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) - : m_xpr(xpr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; -}; - -template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { - impl.evalSubExprsIfNeeded(NULL); - return true; - } -}; - -template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { - return impl.evalSubExprsIfNeeded(data); - } -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorConversionOp XprType; - typedef typename XprType::Index Index; - typedef typename TensorEvaluator::Dimensions Dimensions; - typedef TargetType Scalar; - typedef TargetType CoeffReturnType; - typedef typename internal::remove_all::Scalar>::type SrcType; - typedef typename PacketType::type PacketReturnType; - typedef typename PacketType::type PacketSourceType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = true, - Layout = TensorEvaluator::Layout, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) - { - return ConversionSubExprEval::value, TensorEvaluator, Scalar>::run(m_impl, data); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() - { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - internal::scalar_cast_op converter; - return converter(m_impl.coeff(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const bool Vectorizable = TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast; - return PacketConv::run(m_impl, index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double cast_cost = TensorOpCost::CastCost(); - if (vectorized) { - const double SrcCoeffRatio = - 
internal::type_casting_traits::SrcCoeffRatio; - const double TgtCoeffRatio = - internal::type_casting_traits::TgtCoeffRatio; - return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + - TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); - } else { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); - } - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - template - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - internal::scalar_cast_op converter; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = converter(impl.coeff(index+i)); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - }; - - template - struct PacketConv { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { - const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; - const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; - PacketConverter, PacketSourceType, PacketReturnType, - SrcCoeffRatio, TgtCoeffRatio> converter(impl); - return converter.template packet(index); - } - }; - - TensorEvaluator m_impl; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h deleted file mode 100644 index abdf742..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ /dev/null @@ -1,1104 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H - -namespace Eigen { - -/** \class TensorConvolution - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor convolution class. - * - * - */ -namespace internal { - -template -class IndexMapper { - public: - IndexMapper(const InputDims& input_dims, const array& kernel_dims, - const array& indices) { - - array dimensions = input_dims; - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = indices[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - dimensions[index] = result_dim; - } - - array inputStrides; - array outputStrides; - if (static_cast(Layout) == static_cast(ColMajor)) { - inputStrides[0] = 1; - outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; - outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; - } - } else { - inputStrides[NumDims - 1] = 1; - outputStrides[NumDims - 1] = 1; - for (int i = static_cast(NumDims) - 2; i >= 0; --i) { - inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; - outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; - } - } - - array cudaInputDimensions; - array cudaOutputDimensions; - array tmp = dimensions; - array ordering; - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 
0 - : NumDims - NumKernelDims; - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = i + offset; - ordering[index] = indices[i]; - tmp[indices[i]] = -1; - cudaInputDimensions[index] = input_dims[indices[i]]; - cudaOutputDimensions[index] = dimensions[indices[i]]; - } - - int written = static_cast(Layout) == static_cast(ColMajor) - ? NumKernelDims - : 0; - for (int i = 0; i < NumDims; ++i) { - if (tmp[i] >= 0) { - ordering[written] = i; - cudaInputDimensions[written] = input_dims[i]; - cudaOutputDimensions[written] = dimensions[i]; - ++written; - } - } - - for (int i = 0; i < NumDims; ++i) { - m_inputStrides[i] = inputStrides[ordering[i]]; - m_outputStrides[i] = outputStrides[ordering[i]]; - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims; ++i) { - if (i > NumKernelDims) { - m_cudaInputStrides[i] = - m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; - m_cudaOutputStrides[i] = - m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; - } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; - } - } - } else { - for (int i = NumDims - 1; i >= 0; --i) { - if (i + 1 < offset) { - m_cudaInputStrides[i] = - m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; - m_cudaOutputStrides[i] = - m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; - } else { - m_cudaInputStrides[i] = 1; - m_cudaOutputStrides[i] = 1; - } - } - } - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; - } - inputIndex += p * m_inputStrides[NumKernelDims]; - } else { - std::ptrdiff_t limit = 0; - if (NumKernelDims < NumDims) { - limit = NumDims - NumKernelDims - 1; - } - for (int d = 0; d < limit; ++d) { - const Index idx = p / m_cudaInputStrides[d]; - inputIndex += idx * m_inputStrides[d]; - p -= idx * m_cudaInputStrides[d]; - } - inputIndex += p * m_inputStrides[limit]; - } - return inputIndex; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { - Index outputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_cudaOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; - } - outputIndex += p * m_outputStrides[NumKernelDims]; - } else { - std::ptrdiff_t limit = 0; - if (NumKernelDims < NumDims) { - limit = NumDims - NumKernelDims - 1; - } - for (int d = 0; d < limit; ++d) { - const Index idx = p / m_cudaOutputStrides[d]; - outputIndex += idx * m_outputStrides[d]; - p -= idx * m_cudaOutputStrides[d]; - } - outputIndex += p * m_outputStrides[limit]; - } - return outputIndex; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 
0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + - k * m_inputStrides[offset + 2]; - } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { - const size_t offset = static_cast(Layout) == static_cast(ColMajor) - ? 0 - : NumDims - NumKernelDims; - return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + - k * m_outputStrides[offset + 2]; - } - - private: - static const int NumDims = internal::array_size::value; - array m_inputStrides; - array m_outputStrides; - array m_cudaInputStrides; - array m_cudaOutputStrides; -}; - - - -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename promote_storage_type::ret Scalar; - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename InputXprType::Nested LhsNested; - typedef typename KernelXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorConvolutionOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorConvolutionOp type; -}; - -} // end namespace internal - - - -template -class TensorConvolutionOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) - : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Indices& indices() const { return m_indices; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all::type& - inputExpression() const { return m_input_xpr; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const typename internal::remove_all::type& - kernelExpression() const { return m_kernel_xpr; } - - protected: - typename 
InputXprType::Nested m_input_xpr; - typename KernelXprType::Nested m_kernel_xpr; - const Indices m_indices; -}; - - -template -struct TensorEvaluator, Device> -{ - typedef TensorConvolutionOp XprType; - - static const int NumDims = internal::array_size::Dimensions>::value; - static const int NumKernelDims = internal::array_size::value; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; - } - } else { - m_inputStride[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; - } - } - - m_dimensions = m_inputImpl.dimensions(); - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i > 0) { - m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; - } else { - m_kernelStride[0] = 1; - } - m_indexStride[i] = m_inputStride[index]; - } - - m_outputStride[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; - } - } else { - for (int i = NumKernelDims - 1; i >= 0; --i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - if (i < NumKernelDims - 1) { - m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; - } else { - m_kernelStride[NumKernelDims - 1] = 1; - } - m_indexStride[i] = m_inputStride[index]; - } - - m_outputStride[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_inputImpl.evalSubExprsIfNeeded(NULL); - preloadKernel(); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_local_kernel) { - m_device.deallocate((void*)m_kernel); - 
m_local_kernel = false; - } - m_kernel = NULL; - } - - void evalTo(typename XprType::Scalar* buffer) { - evalSubExprsIfNeeded(NULL); - for (int i = 0; i < dimensions().TotalSize(); ++i) { - buffer[i] += coeff(i); - } - cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - CoeffReturnType result = CoeffReturnType(0); - convolve(firstInput(index), 0, NumKernelDims-1, result); - return result; - } - - template - EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const - { - Index indices[2] = {index, index+PacketSize-1}; - Index startInputs[2] = {0, 0}; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / m_outputStride[i]; - const Index idx1 = indices[1] / m_outputStride[i]; - startInputs[0] += idx0 * m_inputStride[i]; - startInputs[1] += idx1 * m_inputStride[i]; - indices[0] -= idx0 * m_outputStride[i]; - indices[1] -= idx1 * m_outputStride[i]; - } - } - startInputs[0] += indices[0]; - startInputs[1] += indices[1]; - - if (startInputs[1]-startInputs[0] == PacketSize-1) { - PacketReturnType result = internal::pset1(0); - convolvePacket(startInputs[0], 0, NumKernelDims-1, result); - return result; - } else { - EIGEN_ALIGN_MAX Scalar data[PacketSize]; - data[0] = Scalar(0); - convolve(startInputs[0], 0, NumKernelDims-1, data[0]); - for (int i = 1; i < PacketSize-1; ++i) { - data[i] = Scalar(0); - convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); - } - data[PacketSize-1] = Scalar(0); - convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); - return internal::pload(data); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. 
- const double convolve_compute_cost = - TensorOpCost::AddCost() + TensorOpCost::MulCost(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { - Index startInput = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStride[i]; - startInput += idx * m_inputStride[i]; - index -= idx * m_outputStride[i]; - } - } - startInput += index; - return startInput; - } - - EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { - for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { - const Index input = firstIndex + j * m_indexStride[DimIndex]; - const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex > 0) { - convolve(input, kernel, DimIndex-1, accum); - } else { - accum += m_inputImpl.coeff(input) * m_kernel[kernel]; - } - } - } - - template - EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { - for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { - const Index input = firstIndex + j * m_indexStride[DimIndex]; - const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; - if (DimIndex > 0) { - convolvePacket(input, kernel, DimIndex-1, accum); - } else { - accum = internal::pmadd(m_inputImpl.template packet(input), internal::pset1(m_kernel[kernel]), accum); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); - typedef TensorEvalToOp EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); - - m_kernel = local; - m_local_kernel = true; - } - } - - array m_inputStride; - array m_outputStride; - - array m_indexStride; - array m_kernelStride; - TensorEvaluator m_inputImpl; - TensorEvaluator m_kernelImpl; - Dimensions m_dimensions; - - KernelArgType m_kernelArg; - const Scalar* m_kernel; - bool m_local_kernel; - const Device& m_device; -}; - - - - -// Use an optimized implementation of the evaluation code for GPUs whenever possible. 
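The evaluator above backs the user-facing convolve() method; a minimal sketch (sizes and dimension indices are illustrative):

#include <unsupported/Eigen/CXX11/Tensor>

void convolveExample() {
  Eigen::Tensor<float, 3> input(20, 30, 50);
  Eigen::Tensor<float, 2> kernel(3, 3);
  input.setRandom();
  kernel.setRandom();

  // Convolve along dimensions 1 and 2. Each convolved dimension shrinks by
  // kernel_dim - 1, matching result_dim = input_dim - kernel_dim + 1 above,
  // so the output is 20 x 28 x 48.
  Eigen::array<Eigen::Index, 2> dims{{1, 2}};
  Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
}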
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - -template -struct GetKernelSize { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { - return StaticKernelSize; - } -}; -template <> -struct GetKernelSize { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { - return kernelSize; - } -}; - -template -__global__ void EigenConvolutionKernel1D( - InputEvaluator eval, - const internal::IndexMapper - indexMapper, - const float* __restrict kernel, const int numPlanes, const int numX, - const int maxX, const int kernelSize, float* buffer) { - extern __shared__ float s[]; - - const int first_x = blockIdx.x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); - const int num_x_output = last_x - first_x + 1; - - const int first_plane = blockIdx.y * blockDim.y; - const int plane_stride = blockDim.y * gridDim.y; - - for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { - // Load inputs to shared memory - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = threadIdx.y * num_x_input; - #pragma unroll - for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); - s[i + plane_kernel_offset] = eval.coeff(tensor_index); - } - - __syncthreads(); - - // Compute the convolution - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); - - #pragma unroll - for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { - const int kernel_offset = plane_kernel_offset + i; - float result = 0.0f; - #pragma unroll - for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { - result += s[k + kernel_offset] * kernel[k]; - } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); - buffer[tensor_index] = result; - } - __syncthreads(); - } -}; - -template -__global__ void EigenConvolutionKernel2D( - InputEvaluator eval, - const internal::IndexMapper - indexMapper, - const float* __restrict kernel, const int numPlanes, const int numX, - const int maxX, const int numY, const int maxY, const int kernelSizeX, - const int kernelSizeY, float* buffer) { - extern __shared__ float s[]; - - const int first_x = blockIdx.x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); - const int num_x_output = last_x - first_x + 1; - - const int first_y = blockIdx.y * maxY; - const int last_y = (first_y + maxY < numY ? 
first_y + maxY : numY) - 1; - const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); - const int num_y_output = last_y - first_y + 1; - - const int first_plane = blockIdx.z * blockDim.z; - const int plane_stride = blockDim.z * gridDim.z; - - for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { - - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = threadIdx.z * num_y_input; - - // Load inputs to shared memory - #pragma unroll - for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { - const int input_offset = num_x_input * (j + plane_kernel_offset); - #pragma unroll - for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); - s[i + input_offset] = eval.coeff(tensor_index); - } - } - - __syncthreads(); - - // Convolution - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); - - #pragma unroll - for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { - #pragma unroll - for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { - float result = 0.0f; - #pragma unroll - for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { - const int kernel_offset = kernelSizeX * l; - const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); - #pragma unroll - for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { - result += s[k + input_offset] * kernel[k + kernel_offset]; - } - } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); - buffer[tensor_index] = result; - } - } - - __syncthreads(); - } -}; - -template -__global__ void EigenConvolutionKernel3D( - InputEvaluator eval, - const internal::IndexMapper - indexMapper, - const float* __restrict kernel, const size_t numPlanes, const size_t numX, - const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, - const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, - const size_t kernelSizeZ, float* buffer) { - extern __shared__ float s[]; - - // Load inputs to shared memory - const int first_x = blockIdx.x * maxX; - const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; - const int num_x_input = last_x - first_x + kernelSizeX; - - const int first_y = blockIdx.y * maxY; - const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; - const int num_y_input = last_y - first_y + kernelSizeY; - - const int first_z = blockIdx.z * maxZ; - const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; - const int num_z_input = last_z - first_z + kernelSizeZ; - - for (int p = 0; p < numPlanes; ++p) { - - const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); - const int plane_kernel_offset = 0; - - for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { - for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { - for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); - s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); - } - } - } - - __syncthreads(); - - // Convolution - const int num_z_output = last_z - first_z + 1; - const int num_y_output = last_y - first_y + 1; - const int num_x_output = last_x - first_x + 1; - const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); - - for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { - for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { - for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { - float result = 0.0f; - for (int n = 0; n < kernelSizeZ; ++n) { - for (int m = 0; m < kernelSizeY; ++m) { - for (int l = 0; l < kernelSizeX; ++l) { - result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; - } - } - } - const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); - buffer[tensor_index] = result; - } - } - } - __syncthreads(); - } -}; - - - -template -struct TensorEvaluator, GpuDevice> -{ - typedef TensorConvolutionOp XprType; - - static const int NumDims = internal::array_size::Dimensions>::value; - static const int NumKernelDims = internal::array_size::value; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - typedef typename TensorEvaluator::Dimensions KernelDimensions; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) - : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); - - m_dimensions = m_inputImpl.dimensions(); - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - } - } - - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename InputArgType::Scalar Scalar; - static const int PacketSize = internal::unpacket_traits::size; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* 
data) { - preloadKernel(); - m_inputImpl.evalSubExprsIfNeeded(NULL); - if (data) { - executeEval(data); - return false; - } else { - m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); - executeEval(m_buf); - return true; - } - } - - EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_buf) { - m_device.deallocate(m_buf); - m_buf = NULL; - } - if (m_local_kernel) { - m_device.deallocate((void*)m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - - EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. it's an - // expression that needs to be evaluated) - const Scalar* in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - Scalar* local = (Scalar*)m_device.allocate(kernel_sz); - typedef TensorEvalToOp EvalTo; - EvalTo evalToTmp(local, m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); - - m_kernel = local; - m_local_kernel = true; - } - } - - static unsigned int ceil(unsigned int num, unsigned int denom) { - const unsigned int rounded_toward_zero = num / denom; - if (num > rounded_toward_zero * denom) { - return rounded_toward_zero + 1; - } - return rounded_toward_zero; - } - - void executeEval(Scalar* data) const { - typedef typename TensorEvaluator::Dimensions InputDims; - - const int maxSharedMem = m_device.sharedMemPerBlock(); - const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); - const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; - const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); - const int warpSize = 32; - - switch (NumKernelDims) { - case 1: { - const int kernel_size = m_kernelImpl.dimensions().TotalSize(); - - const int numX = dimensions()[m_indices[0]]; - const int numP = dimensions().TotalSize() / numX; - int maxX; - dim3 block_size; - - const int single_stride_dim = - static_cast(Layout) == static_cast(ColMajor) - ? 
0 - : m_inputImpl.dimensions().rank() - 1; - if (m_indices[0] == single_stride_dim) { - // Maximum the reuse - const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; - maxX = numext::mini(inner_dim, numX); - const int maxP = numext::mini(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); - block_size.x = numext::mini(maxThreadsPerBlock, maxX); - block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); - } - else { - // Read as much as possible alongside the inner most dimension, that is the plane - const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); - const int maxP = numext::mini(inner_dim, numP); - maxX = numext::mini(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); - - block_size.x = numext::mini(warpSize, maxX); - block_size.y = numext::mini(maxThreadsPerBlock/block_size.x, maxP); - } - - const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - const int num_x_blocks = ceil(numX, maxX); - const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); - const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); - - dim3 num_blocks(num_x_blocks, numext::mini(num_y_blocks, ceil(numP, block_size.y))); - - - //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - - const array indices(m_indices[0]); - const array kernel_dims(m_kernelImpl.dimensions()[0]); - internal::IndexMapper indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - switch(kernel_size) { - case 4: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); - break; - } - case 7: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); - break; - } - default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); - } - } - break; - } - - case 2: { - const int idxX = - static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; - const int idxY = - static_cast(Layout) == static_cast(ColMajor) ? 
1 : 0; - const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; - - const int numX = dimensions()[m_indices[idxX]]; - const int numY = dimensions()[m_indices[idxY]]; - const int numP = dimensions().TotalSize() / (numX*numY); - - const float scaling_factor = sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); - - // Snap maxX to warp size - int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; - const int maxX = numext::mini(inner_dim, numX); - const int maxY = numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); - const int maxP = numext::mini(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); - - dim3 block_size; - block_size.x = numext::mini(1024, maxX); - block_size.y = numext::mini(1024/block_size.x, maxY); - block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxP); - - const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - const int num_x_blocks = ceil(numX, maxX); - const int num_y_blocks = ceil(numY, maxY); - const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); - const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); - - dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini(num_z_blocks, ceil(numP, block_size.z))); - - - //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - - const array indices(m_indices[idxX], m_indices[idxY]); - const array kernel_dims(m_kernelImpl.dimensions()[idxX], - m_kernelImpl.dimensions()[idxY]); - internal::IndexMapper indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - switch (kernel_size_x) { - case 4: { - switch (kernel_size_y) { - case 7: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); - break; - } - default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); - break; - } - } - break; - } - case 7: { - switch (kernel_size_y) { - case 4: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); - break; - } - default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); - break; - } - } - break; - } - default: { - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); - break; 
- } - } - break; - } - - case 3: { - const int idxX = - static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; - const int idxY = - static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; - const int idxZ = - static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; - - const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; - const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; - const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; - - const int numX = dimensions()[m_indices[idxX]]; - const int numY = dimensions()[m_indices[idxY]]; - const int numZ = dimensions()[m_indices[idxZ]]; - const int numP = dimensions().TotalSize() / (numX*numY*numZ); - - const int maxX = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); - const int maxY = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); - const int maxZ = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); - - dim3 block_size; - block_size.x = numext::mini(32, maxX); - block_size.y = numext::mini(32, maxY); - block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxZ); - dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); - - const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); - assert(shared_mem <= maxSharedMem); - - //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; - const array indices(m_indices[idxX], m_indices[idxY], - m_indices[idxZ]); - const array kernel_dims(m_kernelImpl.dimensions()[idxX], - m_kernelImpl.dimensions()[idxY], - m_kernelImpl.dimensions()[idxZ]); - internal::IndexMapper indexMapper( - m_inputImpl.dimensions(), kernel_dims, indices); - - LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); - break; - } - - default: { - EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return m_buf[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const - { - eigen_assert(m_buf); - eigen_assert(index < m_dimensions.TotalSize()); - return internal::ploadt(m_buf+index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost - // model. - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. 
- const double convolve_compute_cost = - TensorOpCost::AddCost() + TensorOpCost::MulCost(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + - TensorOpCost::DivCost()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + - m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, - PacketSize)); - } - - private: - // No assignment (copies are needed by the kernels) - TensorEvaluator& operator = (const TensorEvaluator&); - - TensorEvaluator m_inputImpl; - TensorEvaluator m_kernelImpl; - KernelArgType m_kernelArg; - Indices m_indices; - Dimensions m_dimensions; - Scalar* m_buf; - const Scalar* m_kernel; - bool m_local_kernel; - - const GpuDevice& m_device; -}; -#endif - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h deleted file mode 100644 index 83c449c..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ /dev/null @@ -1,212 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Rasmus Munk Larsen -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H -#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H - -namespace Eigen { - -/** \class TensorEvaluator - * \ingroup CXX11_Tensor_Module - * - * \brief A cost model used to limit the number of threads used for evaluating - * tensor expression. - * - */ - -// Class storing the cost of evaluating a tensor expression in terms of the -// estimated number of operand bytes loads, bytes stored, and compute cycles. -class TensorOpCost { - public: - // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple - // model based on minimal reciprocal throughput numbers from Intel or - // Agner Fog's tables would be better than what is there now. - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() { - return internal::functor_traits< - internal::scalar_product_op >::Cost; - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() { - return internal::functor_traits >::Cost; - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() { - return internal::functor_traits< - internal::scalar_quotient_op >::Cost; - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() { - return internal::functor_traits >::Cost; - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() { - return internal::functor_traits< - internal::scalar_cast_op >::Cost; - } - - EIGEN_DEVICE_FUNC - TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} - EIGEN_DEVICE_FUNC - TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) - : bytes_loaded_(bytes_loaded), - bytes_stored_(bytes_stored), - compute_cycles_(compute_cycles) {} - - EIGEN_DEVICE_FUNC - TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, - bool vectorized, double packet_size) - : bytes_loaded_(bytes_loaded), - bytes_stored_(bytes_stored), - compute_cycles_(vectorized ? 
compute_cycles / packet_size - : compute_cycles) { - eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded)); - eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored)); - eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { - return bytes_loaded_; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { - return bytes_stored_; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { - return compute_cycles_; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost( - double load_cost, double store_cost, double compute_cost) const { - return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + - compute_cost * compute_cycles_; - } - - // Drop memory access component. Intended for cases when memory accesses are - // sequential or are completely masked by computations. - EIGEN_DEVICE_FUNC void dropMemoryCost() { - bytes_loaded_ = 0; - bytes_stored_ = 0; - } - - // TODO(rmlarsen): Define min in terms of total cost, not elementwise. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin( - const TensorOpCost& rhs) const { - double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded()); - double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored()); - double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles()); - return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); - } - - // TODO(rmlarsen): Define max in terms of total cost, not elementwise. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax( - const TensorOpCost& rhs) const { - double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); - double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored()); - double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles()); - return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( - const TensorOpCost& rhs) { - bytes_loaded_ += rhs.bytes_loaded(); - bytes_stored_ += rhs.bytes_stored(); - compute_cycles_ += rhs.compute_cycles(); - return *this; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) { - bytes_loaded_ *= rhs; - bytes_stored_ *= rhs; - compute_cycles_ *= rhs; - return *this; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+( - TensorOpCost lhs, const TensorOpCost& rhs) { - lhs += rhs; - return lhs; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( - TensorOpCost lhs, double rhs) { - lhs *= rhs; - return lhs; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( - double lhs, TensorOpCost rhs) { - rhs *= lhs; - return rhs; - } - - friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) { - return os << "[bytes_loaded = " << tc.bytes_loaded() - << ", bytes_stored = " << tc.bytes_stored() - << ", compute_cycles = " << tc.compute_cycles() << "]"; - } - - private: - double bytes_loaded_; - double bytes_stored_; - double compute_cycles_; -}; - -// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads -// in [1:max_threads] instead of just switching multi-threading off for small -// work units. -template -class TensorCostModel { - public: - // Scaling from Eigen compute cost to device cycles. - static const int kDeviceCyclesPerComputeCycle = 1; - - // Costs in device cycles. 
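The pieces above compose as in the following sketch, which uses only the public TensorOpCost interface; the per-coefficient figures are illustrative, and the 11/64 cycles-per-byte load/store weights are the same L2-based constants used by the cost model below.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

void costExample() {
  // One float loaded, one float stored, two compute cycles per coefficient.
  Eigen::TensorOpCost per_coeff(sizeof(float), sizeof(float), 2);
  // Scale to an expression producing one million coefficients.
  Eigen::TensorOpCost total = per_coeff * 1000000.0;
  // Collapse to a single cycle estimate with explicit load/store/compute
  // weights.
  double cycles = total.total_cost(11.0 / 64, 11.0 / 64, 1.0);
  std::cout << total << " -> " << cycles << " cycles\n";
}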
- static const int kStartupCycles = 100000; - static const int kPerThreadCycles = 100000; - static const int kTaskSize = 40000; - - // Returns the number of threads in [1:max_threads] to use for - // evaluating an expression with the given output size and cost per - // coefficient. - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( - double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { - double cost = totalCost(output_size, cost_per_coeff); - int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; - return numext::mini(max_threads, numext::maxi(1, threads)); - } - - // taskSize assesses parallel task size. - // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task - // granularity needs to be increased to mitigate parallelization overheads. - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize( - double output_size, const TensorOpCost& cost_per_coeff) { - return totalCost(output_size, cost_per_coeff) / kTaskSize; - } - - private: - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( - double output_size, const TensorOpCost& cost_per_coeff) { - // Cost of memory fetches from L2 cache. 64 is typical cache line size. - // 11 is L2 cache latency on Haswell. - // We don't know whether data is in L1, L2 or L3. But we are most interested - // in single-threaded computational time around 100us-10ms (smaller time - // is too small for parallelization, larger time is not intersting - // either because we are probably using all available threads already). - // And for the target time range, L2 seems to be what matters. Data set - // fitting into L1 is too small to take noticeable time. Data set fitting - // only into L3 presumably will take more than 10ms to load and process. - const double kLoadCycles = 1.0 / 64 * 11; - const double kStoreCycles = 1.0 / 64 * 11; - // Scaling from Eigen compute cost to device cycles. - return output_size * - cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, - kDeviceCyclesPerComputeCycle); - } -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h deleted file mode 100644 index e020d07..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ /dev/null @@ -1,313 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H -#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H - -namespace Eigen { - -/** \class TensorCustomUnaryOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor custom class. 
- * - * - */ -namespace internal { -template -struct traits > -{ - typedef typename XprType::Scalar Scalar; - typedef typename XprType::StorageKind StorageKind; - typedef typename XprType::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorCustomUnaryOp& type; -}; - -template -struct nested > -{ - typedef TensorCustomUnaryOp type; -}; - -} // end namespace internal - - - -template -class TensorCustomUnaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) - : m_expr(expr), m_func(func) {} - - EIGEN_DEVICE_FUNC - const CustomUnaryFunc& func() const { return m_func; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_expr; } - - protected: - typename XprType::Nested m_expr; - const CustomUnaryFunc m_func; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorCustomUnaryOp ArgType; - typedef typename internal::traits::Index Index; - static const int NumDims = internal::traits::NumDimensions; - typedef DSizes Dimensions; - typedef typename internal::remove_const::type Scalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) - : m_op(op), m_device(device), m_result(NULL) - { - m_dimensions = op.func().dimensions(op.expression()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - if (data) { - evalTo(data); - return false; - } else { - m_result = static_cast( - m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - evalTo(m_result); - return true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_result != NULL) { - m_device.deallocate(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_result[index]; - } - - template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - return internal::ploadt(m_result + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. 
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } - - protected: - EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { - TensorMap > result( - data, m_dimensions); - m_op.func().eval(m_op.expression(), result, m_device); - } - - Dimensions m_dimensions; - const ArgType m_op; - const Device& m_device; - CoeffReturnType* m_result; -}; - - - -/** \class TensorCustomBinaryOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor custom class. - * - * - */ -namespace internal { -template -struct traits > -{ - typedef typename internal::promote_storage_type::ret Scalar; - typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = traits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorCustomBinaryOp& type; -}; - -template -struct nested > -{ - typedef TensorCustomBinaryOp type; -}; - -} // end namespace internal - - - -template -class TensorCustomBinaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::traits::CoeffReturnType CoeffReturnType; - typedef typename internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func) - - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} - - EIGEN_DEVICE_FUNC - const CustomBinaryFunc& func() const { return m_func; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const CustomBinaryFunc m_func; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorCustomBinaryOp XprType; - typedef typename internal::traits::Index Index; - static const int NumDims = internal::traits::NumDimensions; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = (internal::packet_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_op(op), m_device(device), m_result(NULL) - { - m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - 
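  // Usage sketch for the binary form (illustrative; functor and tensor names are
  // assumptions): dimensions() now receives both operands, and eval() receives
  // both expressions plus the device, as used by this evaluator.
  //   struct ScaledSum {
  //     template <typename L, typename R>
  //     Eigen::DSizes<Eigen::DenseIndex, 2> dimensions(const L& a, const R& /*b*/) const {
  //       return Eigen::DSizes<Eigen::DenseIndex, 2>(a.dimension(0), a.dimension(1));
  //     }
  //     template <typename L, typename R, typename Output, typename Device>
  //     void eval(const L& a, const R& b, Output& out, const Device& device) const {
  //       out.device(device) = (a + b) * 0.5f;
  //     }
  //   };
  //   // e.g. Eigen::Tensor<float, 2> c = a.customOp(b, ScaledSum());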
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - if (data) { - evalTo(data); - return false; - } else { - m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - evalTo(m_result); - return true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_result != NULL) { - m_device.deallocate(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_result[index]; - } - - template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - return internal::ploadt(m_result + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } - - protected: - EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { - TensorMap > result(data, m_dimensions); - m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); - } - - Dimensions m_dimensions; - const XprType m_op; - const Device& m_device; - CoeffReturnType* m_result; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h deleted file mode 100644 index 29e50a3..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ /dev/null @@ -1,68 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H - -namespace Eigen { - -/** \class TensorDevice - * \ingroup CXX11_Tensor_Module - * - * \brief Pseudo expression providing an operator = that will evaluate its argument - * on the specified computing 'device' (GPU, thread pool, ...) - * - * Example: - * C.device(EIGEN_GPU) = A + B; - * - * Todo: operator *= and /=. 
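 *
 * A slightly fuller sketch (illustrative, assuming EIGEN_USE_THREADS):
 *   Eigen::Tensor<float, 2> A(64, 64), B(64, 64), C(64, 64);
 *   Eigen::ThreadPool pool(4);
 *   Eigen::ThreadPoolDevice dev(&pool, 4);
 *   C.device(dev) = A + B;    // evaluate the sum on the thread pool
 *   C.device(dev) += A;       // operator+= and operator-= are provided below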
- */ - -template class TensorDevice { - public: - TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} - - template - EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { - typedef TensorAssignOp Assign; - Assign assign(m_expression, other); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - template - EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { - typedef typename OtherDerived::Scalar Scalar; - typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; - Sum sum(m_expression, other); - typedef TensorAssignOp Assign; - Assign assign(m_expression, sum); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - template - EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { - typedef typename OtherDerived::Scalar Scalar; - typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference; - Difference difference(m_expression, other); - typedef TensorAssignOp Assign; - Assign assign(m_expression, difference); - internal::TensorExecutor::run(assign, m_device); - return *this; - } - - protected: - const DeviceType& m_device; - ExpressionType& m_expression; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h deleted file mode 100644 index 4f5767b..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ /dev/null @@ -1,337 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H - -namespace Eigen { - -static const int kCudaScratchSize = 1024; - -// This defines an interface that GPUDevice can take to use -// CUDA streams underneath. -class StreamInterface { - public: - virtual ~StreamInterface() {} - - virtual const cudaStream_t& stream() const = 0; - virtual const cudaDeviceProp& deviceProperties() const = 0; - - // Allocate memory on the actual device where the computation will run - virtual void* allocate(size_t num_bytes) const = 0; - virtual void deallocate(void* buffer) const = 0; - - // Return a scratchpad buffer of size 1k - virtual void* scratchpad() const = 0; - - // Return a semaphore. The semaphore is initially initialized to 0, and - // each kernel using it is responsible for resetting to 0 upon completion - // to maintain the invariant that the semaphore is always equal to 0 upon - // each kernel start. - virtual unsigned int* semaphore() const = 0; -}; - -static cudaDeviceProp* m_deviceProperties; -static bool m_devicePropInitialized = false; - -static void initializeDeviceProp() { - if (!m_devicePropInitialized) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. This would be trivial to - // implement if we could use std::mutex, but unfortunately mutex don't - // compile with nvcc, so we resort to atomics and thread fences instead. 
- // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. -#if __cplusplus >= 201103L - static std::atomic first(true); - if (first.exchange(false)) { -#else - static bool first = true; - if (first) { - first = false; -#endif - // We're the first thread to reach this point. - int num_devices; - cudaError_t status = cudaGetDeviceCount(&num_devices); - if (status != cudaSuccess) { - std::cerr << "Failed to get the number of CUDA devices: " - << cudaGetErrorString(status) - << std::endl; - assert(status == cudaSuccess); - } - m_deviceProperties = new cudaDeviceProp[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = cudaGetDeviceProperties(&m_deviceProperties[i], i); - if (status != cudaSuccess) { - std::cerr << "Failed to initialize CUDA device #" - << i - << ": " - << cudaGetErrorString(status) - << std::endl; - assert(status == cudaSuccess); - } - } - -#if __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_release); -#endif - m_devicePropInitialized = true; - } else { - // Wait for the other thread to inititialize the properties. - while (!m_devicePropInitialized) { -#if __cplusplus >= 201103L - std::atomic_thread_fence(std::memory_order_acquire); -#endif - sleep(1); - } - } - } -} - -static const cudaStream_t default_stream = cudaStreamDefault; - -class CudaStreamDevice : public StreamInterface { - public: - // Use the default stream on the current device - CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { - cudaGetDevice(&device_); - initializeDeviceProp(); - } - // Use the default stream on the specified device - CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { - initializeDeviceProp(); - } - // Use the specified stream. Note that it's the - // caller responsibility to ensure that the stream can run on - // the specified device. If no device is specified the code - // assumes that the stream is associated to the current gpu device. 
- CudaStreamDevice(const cudaStream_t* stream, int device = -1) - : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { - if (device < 0) { - cudaGetDevice(&device_); - } else { - int num_devices; - cudaError_t err = cudaGetDeviceCount(&num_devices); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - assert(device < num_devices); - device_ = device; - } - initializeDeviceProp(); - } - - virtual ~CudaStreamDevice() { - if (scratch_) { - deallocate(scratch_); - } - } - - const cudaStream_t& stream() const { return *stream_; } - const cudaDeviceProp& deviceProperties() const { - return m_deviceProperties[device_]; - } - virtual void* allocate(size_t num_bytes) const { - cudaError_t err = cudaSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - void* result; - err = cudaMalloc(&result, num_bytes); - assert(err == cudaSuccess); - assert(result != NULL); - return result; - } - virtual void deallocate(void* buffer) const { - cudaError_t err = cudaSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - assert(buffer != NULL); - err = cudaFree(buffer); - assert(err == cudaSuccess); - } - - virtual void* scratchpad() const { - if (scratch_ == NULL) { - scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); - } - return scratch_; - } - - virtual unsigned int* semaphore() const { - if (semaphore_ == NULL) { - char* scratch = static_cast(scratchpad()) + kCudaScratchSize; - semaphore_ = reinterpret_cast(scratch); - cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - return semaphore_; - } - - private: - const cudaStream_t* stream_; - int device_; - mutable void* scratch_; - mutable unsigned int* semaphore_; -}; - -struct GpuDevice { - // The StreamInterface is not owned: the caller is - // responsible for its initialization and eventual destruction. - explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { - eigen_assert(stream); - } - explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { - eigen_assert(stream); - } - // TODO(bsteiner): This is an internal API, we should not expose it. 
- EIGEN_STRONG_INLINE const cudaStream_t& stream() const { - return stream_->stream(); - } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - stream_->deallocate(buffer); - } - - EIGEN_STRONG_INLINE void* scratchpad() const { - return stream_->scratchpad(); - } - - EIGEN_STRONG_INLINE unsigned int* semaphore() const { - return stream_->semaphore(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ - cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, - stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - cudaError_t err = - cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#ifndef __CUDA_ARCH__ - cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE size_t numThreads() const { - // FIXME - return 32; - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on cuda devices. - return firstLevelCacheSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) - cudaError_t err = cudaStreamSynchronize(stream_->stream()); - if (err != cudaSuccess) { - std::cerr << "Error detected in CUDA stream: " - << cudaGetErrorString(err) - << std::endl; - assert(err == cudaSuccess); - } -#else - assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { - return stream_->deviceProperties().multiProcessorCount; - } - EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { - return stream_->deviceProperties().maxThreadsPerBlock; - } - EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { - return stream_->deviceProperties().maxThreadsPerMultiProcessor; - } - EIGEN_STRONG_INLINE int sharedMemPerBlock() const { - return stream_->deviceProperties().sharedMemPerBlock; - } - EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return stream_->deviceProperties().major; - } - EIGEN_STRONG_INLINE int minorDeviceVersion() const { - return stream_->deviceProperties().minor; - } - - EIGEN_STRONG_INLINE int maxBlocks() const { - return max_blocks_; - } - - // This function checks if the CUDA runtime recorded an error for the - // underlying stream device. 
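  // Host-side setup sketch for this device (illustrative; assumes EIGEN_USE_GPU,
  // compilation with nvcc, and d_a/d_b/d_c pointing to device memory of size n):
  //   Eigen::CudaStreamDevice stream;            // default stream on the current device
  //   Eigen::GpuDevice gpu_device(&stream);      // does not take ownership of the stream
  //   Eigen::TensorMap<Eigen::Tensor<float, 1> > a(d_a, n), b(d_b, n), c(d_c, n);
  //   c.device(gpu_device) = a + b;              // kernel launched on the stream
  //   gpu_device.synchronize();
  //   assert(gpu_device.ok());                   // stream recorded no error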
- inline bool ok() const { -#ifdef __CUDACC__ - cudaError_t error = cudaStreamQuery(stream_->stream()); - return (error == cudaSuccess) || (error == cudaErrorNotReady); -#else - return false; -#endif - } - - private: - const StreamInterface* stream_; - int max_blocks_; -}; - -#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ - assert(cudaGetLastError() == cudaSuccess); - - -// FIXME: Should be device and kernel specific. -#ifdef __CUDACC__ -static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { -#ifndef __CUDA_ARCH__ - cudaError_t status = cudaDeviceSetSharedMemConfig(config); - EIGEN_UNUSED_VARIABLE(status) - assert(status == cudaSuccess); -#else - EIGEN_UNUSED_VARIABLE(config) -#endif -} -#endif - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h deleted file mode 100644 index 9d14139..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ /dev/null @@ -1,81 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H - - -namespace Eigen { - -// Default device for the machine (typically a single cpu core) -struct DefaultDevice { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return internal::aligned_malloc(num_bytes); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - internal::aligned_free(buffer); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - ::memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { - ::memset(buffer, c, n); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#ifndef __CUDA_ARCH__ - // Running on the host CPU - return 1; -#else - // Running on a CUDA device - return 32; -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#ifndef __CUDA_ARCH__ - // Running on the host CPU - return l1CacheSize(); -#else - // Running on a CUDA device, return the amount of shared memory available. 
- return 48*1024; -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#ifndef __CUDA_ARCH__ - // Running single threaded on the host CPU - return l3CacheSize(); -#else - // Running on a CUDA device - return firstLevelCacheSize(); -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef __CUDA_ARCH__ - // Running single threaded on the host CPU - // Should return an enum that encodes the ISA supported by the CPU - return 1; -#else - // Running on a CUDA device - return __CUDA_ARCH__ / 100; -#endif - } -}; - -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h deleted file mode 100644 index 7c03989..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ /dev/null @@ -1,122 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// Copyright (C) 2016 Benoit Steiner - -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H - -namespace Eigen { -struct SyclDevice { - /// class members - /// sycl queue - mutable cl::sycl::queue m_queue; - /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. - /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. - mutable std::map> buffer_map; - /// creating device by using selector - template SyclDevice(dev_Selector s) - : -#ifdef EIGEN_EXCEPTIONS - m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) { - for (const auto& e : l) { - try { - std::rethrow_exception(e); - } catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - } - } - })) -#else - m_queue(cl::sycl::queue(s)) -#endif - {} - // destructor - ~SyclDevice() { deallocate_all(); } - - template void deallocate(T *p) const { - auto it = buffer_map.find(p); - if (it != buffer_map.end()) { - buffer_map.erase(it); - internal::aligned_free(p); - } - } - void deallocate_all() const { - std::map>::iterator it=buffer_map.begin(); - while (it!=buffer_map.end()) { - auto p=it->first; - buffer_map.erase(it); - internal::aligned_free(const_cast(p)); - it=buffer_map.begin(); - } - buffer_map.clear(); - } - - /// creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if not, - ///the function then adds an entry by creating a sycl buffer for that particular pointer. 
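  /// The lookup-or-create step described above behaves roughly like this generic
  /// sketch (illustrative standard-library types, not the actual cl::sycl buffer API):
  ///   std::map<const void*, std::shared_ptr<std::vector<char> > > live_buffers;
  ///   std::vector<char>& find_or_register(const void* ptr, size_t num_bytes) {
  ///     std::map<const void*, std::shared_ptr<std::vector<char> > >::iterator it =
  ///         live_buffers.find(ptr);
  ///     if (it == live_buffers.end())
  ///       it = live_buffers.insert(std::make_pair(ptr,
  ///                std::shared_ptr<std::vector<char> >(new std::vector<char>(num_bytes)))).first;
  ///     return *it->second;   // later accessor requests for `ptr` reuse this buffer
  ///   }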
- template inline cl::sycl::accessor - get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const { - return (get_sycl_buffer(num_bytes, ptr)->template get_access(cgh)); - } - - template inline std::pair>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const { - using Type = cl::sycl::buffer; - std::pair>::iterator,bool> ret = buffer_map.insert(std::pair>(ptr, std::shared_ptr(new Type(cl::sycl::range<1>(num_bytes)), - [](void *dataMem) { delete static_cast(dataMem); }))); - (static_cast(buffer_map.at(ptr).get()))->set_final_data(nullptr); - return ret; - } - - template inline cl::sycl::buffer* get_sycl_buffer(size_t num_bytes,const T * ptr) const { - return static_cast*>(add_sycl_buffer(ptr, num_bytes).first->second.get()); - } - - /// allocating memory on the cpu - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const { - return internal::aligned_malloc(8); - } - - // some runtime conditions that can be applied here - bool isDeviceSuitable() const { return true; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { - ::memcpy(dst, src, n); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { - auto host_acc= (static_cast*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access(); - memcpy(host_acc.get_pointer(), src, n); - } - /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { - auto it = buffer_map.find(src); - if (it != buffer_map.end()) { - auto host_acc= (static_cast*>(it->second.get()))-> template get_access(); - memcpy(dst,host_acc.get_pointer(), n); - } else{ - eigen_assert("no device memory found. The memory might be destroyed before creation"); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const { - ::memset(buffer, c, n); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return 1; - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h deleted file mode 100644 index 17f0466..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ /dev/null @@ -1,282 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H - -namespace Eigen { - -// Use the SimpleThreadPool by default. We'll switch to the new non blocking -// thread pool later. -#ifndef EIGEN_USE_SIMPLE_THREAD_POOL -template using ThreadPoolTempl = NonBlockingThreadPoolTempl; -typedef NonBlockingThreadPool ThreadPool; -#else -template using ThreadPoolTempl = SimpleThreadPoolTempl; -typedef SimpleThreadPool ThreadPool; -#endif - - -// Barrier is an object that allows one or more threads to wait until -// Notify has been called a specified number of times. 
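// Usage sketch (illustrative; `pool`, `num_tasks` and `do_chunk` are assumed to
// exist): one Notify() per scheduled task, one Wait() in the submitting thread.
//   Eigen::Barrier barrier(static_cast<unsigned int>(num_tasks));
//   for (int i = 0; i < num_tasks; ++i)
//     pool.Schedule([&barrier, i]() { do_chunk(i); barrier.Notify(); });
//   barrier.Wait();   // returns once Notify() has been called num_tasks times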
-class Barrier { - public: - Barrier(unsigned int count) : state_(count << 1), notified_(false) { - eigen_assert(((count << 1) >> 1) == count); - } - ~Barrier() { - eigen_assert((state_>>1) == 0); - } - - void Notify() { - unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; - if (v != 1) { - eigen_assert(((v + 2) & ~1) != 0); - return; // either count has not dropped to 0, or waiter is not waiting - } - std::unique_lock l(mu_); - eigen_assert(!notified_); - notified_ = true; - cv_.notify_all(); - } - - void Wait() { - unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); - if ((v >> 1) == 0) return; - std::unique_lock l(mu_); - while (!notified_) { - cv_.wait(l); - } - } - - private: - std::mutex mu_; - std::condition_variable cv_; - std::atomic state_; // low bit is waiter flag - bool notified_; -}; - - -// Notification is an object that allows a user to to wait for another -// thread to signal a notification that an event has occurred. -// -// Multiple threads can wait on the same Notification object, -// but only one caller must call Notify() on the object. -struct Notification : Barrier { - Notification() : Barrier(1) {}; -}; - - -// Runs an arbitrary function and then calls Notify() on the passed in -// Notification. -template struct FunctionWrapperWithNotification -{ - static void run(Notification* n, Function f, Args... args) { - f(args...); - if (n) { - n->Notify(); - } - } -}; - -template struct FunctionWrapperWithBarrier -{ - static void run(Barrier* b, Function f, Args... args) { - f(args...); - if (b) { - b->Notify(); - } - } -}; - -template -static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { - if (n) { - n->Wait(); - } -} - - -// Build a thread pool device on top the an existing pool of threads. -struct ThreadPoolDevice { - // The ownership of the thread pool remains with the caller. - ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return internal::aligned_malloc(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - internal::aligned_free(buffer); - } - - EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { - ::memcpy(dst, src, n); - } - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - memcpy(dst, src, n); - } - - EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { - ::memset(buffer, c, n); - } - - EIGEN_STRONG_INLINE int numThreads() const { - return num_threads_; - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - return l1CacheSize(); - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // The l3 cache size is shared between all the cores. - return l3CacheSize() / num_threads_; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { - // Should return an enum that encodes the ISA supported by the CPU - return 1; - } - - template - EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { - Notification* n = new Notification(); - pool_->Schedule(std::bind(&FunctionWrapperWithNotification::run, n, f, args...)); - return n; - } - - template - EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, - Function&& f, - Args&&... 
args) const { - pool_->Schedule(std::bind( - &FunctionWrapperWithBarrier::run, b, f, args...)); - } - - template - EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { - pool_->Schedule(std::bind(f, args...)); - } - - // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if - // called from one of the threads in pool_. Returns -1 otherwise. - EIGEN_STRONG_INLINE int currentThreadId() const { - return pool_->CurrentThreadId(); - } - - // parallelFor executes f with [0, n) arguments in parallel and waits for - // completion. F accepts a half-open interval [first, last). - // Block size is choosen based on the iteration cost and resulting parallel - // efficiency. If block_align is not nullptr, it is called to round up the - // block size. - void parallelFor(Index n, const TensorOpCost& cost, - std::function block_align, - std::function f) const { - typedef TensorCostModel CostModel; - if (n <= 1 || numThreads() == 1 || - CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { - f(0, n); - return; - } - - // Calculate block size based on (1) the iteration cost and (2) parallel - // efficiency. We want blocks to be not too small to mitigate - // parallelization overheads; not too large to mitigate tail - // effect and potential load imbalance and we also want number - // of blocks to be evenly dividable across threads. - - double block_size_f = 1.0 / CostModel::taskSize(1, cost); - const Index max_oversharding_factor = 4; - Index block_size = numext::mini( - n, numext::maxi(divup(n, max_oversharding_factor * numThreads()), - block_size_f)); - const Index max_block_size = numext::mini(n, 2 * block_size); - if (block_align) { - Index new_block_size = block_align(block_size); - eigen_assert(new_block_size >= block_size); - block_size = numext::mini(n, new_block_size); - } - Index block_count = divup(n, block_size); - // Calculate parallel efficiency as fraction of total CPU time used for - // computations: - double max_efficiency = - static_cast(block_count) / - (divup(block_count, numThreads()) * numThreads()); - // Now try to increase block size up to max_block_size as long as it - // doesn't decrease parallel efficiency. - for (Index prev_block_count = block_count; - max_efficiency < 1.0 && prev_block_count > 1;) { - // This is the next block size that divides size into a smaller number - // of blocks than the current block_size. - Index coarser_block_size = divup(n, prev_block_count - 1); - if (block_align) { - Index new_block_size = block_align(coarser_block_size); - eigen_assert(new_block_size >= coarser_block_size); - coarser_block_size = numext::mini(n, new_block_size); - } - if (coarser_block_size > max_block_size) { - break; // Reached max block size. Stop. - } - // Recalculate parallel efficiency. - const Index coarser_block_count = divup(n, coarser_block_size); - eigen_assert(coarser_block_count < prev_block_count); - prev_block_count = coarser_block_count; - const double coarser_efficiency = - static_cast(coarser_block_count) / - (divup(coarser_block_count, numThreads()) * numThreads()); - if (coarser_efficiency + 0.01 >= max_efficiency) { - // Taking it. - block_size = coarser_block_size; - block_count = coarser_block_count; - if (max_efficiency < coarser_efficiency) { - max_efficiency = coarser_efficiency; - } - } - } - - // Recursively divide size into halves until we reach block_size. - // Division code rounds mid to block_size, so we are guaranteed to get - // block_count leaves that do actual computations. 
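    // Caller-side sketch of this function (illustrative cost figures; `pool`,
    // `in`, `out` and `f` are assumptions):
    //   Eigen::ThreadPoolDevice device(&pool, num_threads);
    //   Eigen::TensorOpCost cost(/*bytes_loaded=*/sizeof(float),
    //                            /*bytes_stored=*/sizeof(float),
    //                            /*compute_cycles=*/10);
    //   device.parallelFor(n, cost, [&](Eigen::Index first, Eigen::Index last) {
    //     for (Eigen::Index i = first; i < last; ++i) out[i] = f(in[i]);
    //   });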
- Barrier barrier(static_cast(block_count)); - std::function handleRange; - handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { - if (last - first <= block_size) { - // Single block or less, execute directly. - f(first, last); - barrier.Notify(); - return; - } - // Split into halves and submit to the pool. - Index mid = first + divup((last - first) / 2, block_size) * block_size; - pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); - pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); - }; - handleRange(0, n); - barrier.Wait(); - } - - // Convenience wrapper for parallelFor that does not align blocks. - void parallelFor(Index n, const TensorOpCost& cost, - std::function f) const { - parallelFor(n, cost, nullptr, std::move(f)); - } - - private: - ThreadPoolInterface* pool_; - int num_threads_; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h deleted file mode 100644 index 1a30e45..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h +++ /dev/null @@ -1,236 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H -#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H - -namespace Eigen { - -/** \internal - * - * \class TensorDimensionList - * \ingroup CXX11_Tensor_Module - * - * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n. 
- * - * \sa Tensor - */ - -template struct DimensionList { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - const Index operator[] (const Index i) const { return i; } -}; - -namespace internal { - -template struct array_size > { - static const size_t value = Rank; -}; -template struct array_size > { - static const size_t value = Rank; -}; - -template const Index array_get(DimensionList&) { - return n; -} -template const Index array_get(const DimensionList&) { - return n; -} - - -#if EIGEN_HAS_CONSTEXPR -template -struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { - return true; - } -}; -template -struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { - return true; - } -}; - -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return true; - } -}; -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return true; - } -}; - -template -struct indices_statically_known_to_increase_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return true; - } -}; -template -struct indices_statically_known_to_increase_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return true; - } -}; - -template -struct index_statically_eq_impl > { - static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i == value; - } -}; -template -struct index_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i == value; - } -}; - -template -struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i != value; - } -}; -template -struct index_statically_ne_impl > { - static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i != value; - } -}; - -template -struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i > value; - } -}; -template -struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i > value; - } -}; - -template -struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i < value; - } -}; -template -struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return i < value; - } -}; - -#else -template -struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { - return true; - } -}; -template -struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { - return true; - } -}; - -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; - -template -struct indices_statically_known_to_increase_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; -template -struct indices_statically_known_to_increase_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return true; - } -}; - -template -struct index_statically_eq_impl 
> { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -template -struct index_statically_eq_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; - -template -struct index_statically_ne_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){ - return false; - } -}; -template -struct index_statically_ne_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; - -template -struct index_statically_gt_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -template -struct index_statically_gt_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; - -template -struct index_statically_lt_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -template -struct index_statically_lt_impl > { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { - return false; - } -}; -#endif - -} // end namespace internal -} // end namespace Eigen - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h deleted file mode 100644 index 451940d..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ /dev/null @@ -1,428 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H -#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H - - -namespace Eigen { - -/** \internal - * - * \class TensorDimensions - * \ingroup CXX11_Tensor_Module - * - * \brief Set of classes used to encode and store the dimensions of a Tensor. - * - * The Sizes class encodes as part of the type the number of dimensions and the - * sizes corresponding to each dimension. It uses no storage space since it is - * entirely known at compile time. - * The DSizes class is its dynamic sibling: the number of dimensions is known - * at compile time but the sizes are set during execution. - * - * \sa Tensor - */ - -// Boilerplate code -namespace internal { - -template struct dget { - static const std::size_t value = get::value; -}; - - -template -struct fixed_size_tensor_index_linearization_helper -{ - template EIGEN_DEVICE_FUNC - static inline Index run(array const& indices, - const Dimensions& dimensions) - { - return array_get(indices) + - dget::value * - fixed_size_tensor_index_linearization_helper::run(indices, dimensions); - } -}; - -template -struct fixed_size_tensor_index_linearization_helper -{ - template EIGEN_DEVICE_FUNC - static inline Index run(array const&, const Dimensions&) - { - return 0; - } -}; - -template -struct fixed_size_tensor_index_extraction_helper -{ - template EIGEN_DEVICE_FUNC - static inline Index run(const Index index, - const Dimensions& dimensions) - { - const Index mult = (index == n-1) ? 
1 : 0; - return array_get(dimensions) * mult + - fixed_size_tensor_index_extraction_helper::run(index, dimensions); - } -}; - -template -struct fixed_size_tensor_index_extraction_helper -{ - template EIGEN_DEVICE_FUNC - static inline Index run(const Index, - const Dimensions&) - { - return 0; - } - }; - -} // end namespace internal - - -// Fixed size -#ifndef EIGEN_EMULATE_CXX11_META_H -template -struct Sizes : internal::numeric_list { - typedef internal::numeric_list Base; - static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { - return Base::count; - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() { - return internal::arg_prod(Indices...); - } - - EIGEN_DEVICE_FUNC Sizes() { } - template - explicit EIGEN_DEVICE_FUNC Sizes(const array& /*indices*/) { - // todo: add assertion - } -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { } - explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list /*l*/) { - // todo: add assertion - } -#endif - - template Sizes& operator = (const T& /*other*/) { - // add assertion failure if the size of other is different - return *this; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const { - return internal::fixed_size_tensor_index_extraction_helper::run(index, *this); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); - } -}; - -namespace internal { -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { - return Sizes::total_size; -} -} - -#else - -template -struct non_zero_size { - typedef internal::type2val type; -}; -template <> -struct non_zero_size<0> { - typedef internal::null_type type; -}; - -template struct Sizes { - typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; - static const size_t count = Base::count; - static const std::size_t total_size = internal::arg_prod::value; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { - return count; - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { - return internal::arg_prod::value; - } - - Sizes() { } - template - explicit Sizes(const array& /*indices*/) { - // todo: add assertion - } - template Sizes& operator = (const T& /*other*/) { - // add assertion failure if the size of other is different - return *this; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template Sizes(DenseIndex... 
/*indices*/) { } - explicit Sizes(std::initializer_list) { - // todo: add assertion - } -#else - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { - } - EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator[] (const Index index) const { - switch (index) { - case 0: - return internal::get<0, Base>::value; - case 1: - return internal::get<1, Base>::value; - case 2: - return internal::get<2, Base>::value; - case 3: - return internal::get<3, Base>::value; - case 4: - return internal::get<4, Base>::value; - default: - eigen_assert(false && "index overflow"); - return static_cast(-1); - } - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - size_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); - } -}; - -namespace internal { -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { - return Sizes::total_size; -} -} - -#endif - -// Boilerplate -namespace internal { -template -struct tensor_index_linearization_helper -{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array const& indices, array const& dimensions) - { - return array_get(indices) + - array_get(dimensions) * - tensor_index_linearization_helper::run(indices, dimensions); - } -}; - -template -struct tensor_index_linearization_helper -{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array const& indices, array const&) - { - return array_get(indices); - } -}; -} // end namespace internal - - - -// Dynamic size -template -struct DSizes : array { - typedef array Base; - static const int count = NumDims; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { - return NumDims; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const { - return (NumDims == 0) ? 1 : internal::array_prod(*static_cast(this)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() { - for (int i = 0 ; i < NumDims; ++i) { - (*this)[i] = 0; - } - } - EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) { } - - EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { - eigen_assert(NumDims == 1); - (*this)[0] = i0; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... 
otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { - EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#else - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) { - eigen_assert(NumDims == 2); - (*this)[0] = i0; - (*this)[1] = i1; - } - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { - eigen_assert(NumDims == 3); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - } - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { - eigen_assert(NumDims == 4); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - (*this)[3] = i3; - } - EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { - eigen_assert(NumDims == 5); - (*this)[0] = i0; - (*this)[1] = i1; - (*this)[2] = i2; - (*this)[3] = i3; - (*this)[4] = i4; - } -#endif - - EIGEN_DEVICE_FUNC DSizes& operator = (const array& other) { - *static_cast(this) = other; - return *this; - } - - // A constexpr would be so much better here - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array& indices) const { - return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array& indices) const { - return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); - } -}; - - - - -// Boilerplate -namespace internal { -template -struct tensor_vsize_index_linearization_helper -{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array const& indices, std::vector const& dimensions) - { - return array_get(indices) + - array_get(dimensions) * - tensor_vsize_index_linearization_helper::run(indices, dimensions); - } -}; - -template -struct tensor_vsize_index_linearization_helper -{ - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index run(array const& indices, std::vector const&) - { - return array_get(indices); - } -}; -} // end namespace internal - - -namespace internal { - -template struct array_size > { - static const size_t value = NumDims; -}; -template struct array_size > { - static const size_t value = NumDims; -}; -#ifndef EIGEN_EMULATE_CXX11_META_H -template struct array_size > { -static const std::ptrdiff_t value = Sizes::count; -}; -template struct array_size > { -static const std::ptrdiff_t value = Sizes::count; -}; -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { - return get >::value; -} -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { - eigen_assert(false && "should never be called"); - return -1; -} -#else -template struct array_size > { - static const size_t value = Sizes::count; -}; -template struct array_size > { - static const size_t value = Sizes::count; -}; -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes&) { - return get::Base>::value; -} - -#endif - - -template -struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { - return false; - } -}; -template -struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) { - return (array_get(dims1) == array_get(dims2)) & - sizes_match_below_dim::run(dims1, dims2); - } -}; -template -struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { - 
return true; - } -}; - -} // end namespace internal - - -template -EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { - return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); -} - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h deleted file mode 100644 index 0698713..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ /dev/null @@ -1,181 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H -#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H - -namespace Eigen { - -/** \class TensorForcedEval - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reshaping class. - * - * - */ -namespace internal { -template class MakePointer_> -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - - enum { - Flags = 0 - }; - template - struct MakePointer { - // Intermediate typedef to workaround MSVC issue. 
- typedef MakePointer_ MakePointerT; - typedef typename MakePointerT::Type Type; - }; -}; - -template class MakePointer_> -struct eval, Eigen::Dense> -{ - typedef const TensorEvalToOp& type; -}; - -template class MakePointer_> -struct nested, 1, typename eval >::type> -{ - typedef TensorEvalToOp type; -}; - -} // end namespace internal - - - - -template class MakePointer_> -class TensorEvalToOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename MakePointer_::Type PointerType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) - : m_xpr(expr), m_buffer(buffer) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; } - - protected: - typename XprType::Nested m_xpr; - PointerType m_buffer; -}; - - - -template class MakePointer_> -struct TensorEvaluator, Device> -{ - typedef TensorEvalToOp XprType; - typedef typename ArgType::Scalar Scalar; - typedef typename TensorEvaluator::Dimensions Dimensions; - typedef typename XprType::Index Index; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), - m_buffer(op.buffer()), m_op(op), m_expression(op.expression()) - { } - - // Used for accessor extraction in SYCL Managed TensorMap: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { - return m_op; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { - } - - typedef typename internal::traits >::template MakePointer::Type DevicePointer; - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) { - EIGEN_UNUSED_VARIABLE(scalar); - eigen_assert(scalar == NULL); - return m_impl.evalSubExprsIfNeeded(m_buffer); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { - m_buffer[i] = m_impl.coeff(i); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? 
Aligned : Unaligned>(i)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_buffer[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return internal::ploadt(m_buffer + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - // We assume that evalPacket or evalScalar is called to perform the - // assignment and account for the cost of the write here. - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; } - ArgType expression() const { return m_expression; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - - private: - TensorEvaluator m_impl; - const Device& m_device; - DevicePointer m_buffer; - const XprType& m_op; - const ArgType m_expression; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h deleted file mode 100644 index 834ce07..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ /dev/null @@ -1,633 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H - -namespace Eigen { - -/** \class TensorEvaluator - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor evaluator classes. - * - * These classes are responsible for the evaluation of the tensor expression. - * - * TODO: add support for more types of expressions, in particular expressions - * leading to lvalues (slicing, reshaping, etc...) - */ - -// Generic evaluator -template -struct TensorEvaluator -{ - typedef typename Derived::Index Index; - typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename Derived::Dimensions Dimensions; - - // NumDimensions is -1 for variable dim tensors - static const int NumCoords = internal::traits::NumDimensions > 0 ? 
- internal::traits::NumDimensions : 0; - - enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) - { } - - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { - if (dest) { - m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); - return false; - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); - return m_data[index]; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - eigen_assert(m_data); - return m_data[index]; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketReturnType packet(Index index) const - { - return internal::ploadt(m_data + index); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - return internal::pstoret(m_data + index, x); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data); - if (static_cast(Layout) == static_cast(ColMajor)) { - return m_data[m_dims.IndexOfColMajor(coords)]; - } else { - return m_data[m_dims.IndexOfRowMajor(coords)]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { - eigen_assert(m_data); - if (static_cast(Layout) == static_cast(ColMajor)) { - return m_data[m_dims.IndexOfColMajor(coords)]; - } else { - return m_data[m_dims.IndexOfRowMajor(coords)]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); - } - - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } - - /// required by sycl in order to construct sycl buffer from raw pointer - const Device& device() const{return m_device;} - - protected: - typename internal::traits::template MakePointer::Type m_data; - Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; -}; - -namespace { -template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T loadConstant(const T* address) { - return *address; -} -// Use the texture cache on CUDA devices whenever possible -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -float loadConstant(const float* address) { - return __ldg(address); -} -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -double loadConstant(const double* address) { - return __ldg(address); -} -template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -Eigen::half loadConstant(const Eigen::half* address) { - return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); -} -#endif -} - - -// Default evaluator for rvalues -template -struct TensorEvaluator -{ - typedef typename Derived::Index Index; - typedef typename Derived::Scalar Scalar; - typedef typename Derived::Scalar CoeffReturnType; - 
typedef typename PacketType::type PacketReturnType; - typedef typename Derived::Dimensions Dimensions; - - // NumDimensions is -1 for variable dim tensors - static const int NumCoords = internal::traits::NumDimensions > 0 ? - internal::traits::NumDimensions : 0; - - enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true - }; - - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - if (!NumTraits::type>::RequireInitialization && data) { - m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); - return false; - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); - return loadConstant(m_data+index); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketReturnType packet(Index index) const - { - return internal::ploadt_ro(m_data + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data); - const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? m_dims.IndexOfColMajor(coords) - : m_dims.IndexOfRowMajor(coords); - return loadConstant(m_data+index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); - } - - EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } - - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - - protected: - typename internal::traits::template MakePointer::Type m_data; - Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; -}; - - - - -// -------------------- CwiseNullaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseNullaryOp XprType; - - enum { - IsAligned = true, - PacketAccess = internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC - TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_wrapper(m_functor, index); 
- } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_wrapper.template packetOp(m_functor, index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits::size); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_argImpl; } - /// required by sycl in order to extract the accessor - NullaryOp functor() const { return m_functor; } - - - private: - const NullaryOp m_functor; - TensorEvaluator m_argImpl; - const internal::nullary_wrapper m_wrapper; -}; - - - -// -------------------- CwiseUnaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseUnaryOp XprType; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), - m_argImpl(op.nestedExpression(), device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_argImpl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_argImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_argImpl.coeff(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_argImpl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits::Cost; - return m_argImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator & impl() const { return m_argImpl; } - /// added for sycl in order to construct the buffer from sycl device - UnaryOp functor() const { return m_functor; } - - - private: - const UnaryOp m_functor; - TensorEvaluator m_argImpl; -}; - - -// -------------------- CwiseBinaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseBinaryOp XprType; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), - m_leftImpl(op.lhsExpression(), device), - m_rightImpl(op.rhsExpression(), 
device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use right impl instead if right impl dimensions are known at compile time. - return m_leftImpl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_leftImpl.evalSubExprsIfNeeded(NULL); - m_rightImpl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_leftImpl.cleanup(); - m_rightImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits::Cost; - return m_leftImpl.costPerCoeff(vectorized) + - m_rightImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - BinaryOp functor() const { return m_functor; } - - private: - const BinaryOp m_functor; - TensorEvaluator m_leftImpl; - TensorEvaluator m_rightImpl; -}; - -// -------------------- CwiseTernaryOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorCwiseTernaryOp XprType; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), - m_arg1Impl(op.arg1Expression(), device), - m_arg2Impl(op.arg2Expression(), device), - m_arg3Impl(op.arg3Expression(), device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - - EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, - typename internal::traits::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, - typename internal::traits::StorageKind>::value), - STORAGE_KIND_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same::Index, - typename 
internal::traits::Index>::value), - STORAGE_INDEX_MUST_MATCH) - EIGEN_STATIC_ASSERT((internal::is_same::Index, - typename internal::traits::Index>::value), - STORAGE_INDEX_MUST_MATCH) - - eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use arg2 or arg3 dimensions if they are known at compile time. - return m_arg1Impl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_arg1Impl.evalSubExprsIfNeeded(NULL); - m_arg2Impl.evalSubExprsIfNeeded(NULL); - m_arg3Impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_arg1Impl.cleanup(); - m_arg2Impl.cleanup(); - m_arg3Impl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_functor.packetOp(m_arg1Impl.template packet(index), - m_arg2Impl.template packet(index), - m_arg3Impl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - const double functor_cost = internal::functor_traits::Cost; - return m_arg1Impl.costPerCoeff(vectorized) + - m_arg2Impl.costPerCoeff(vectorized) + - m_arg3Impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator & arg1Impl() const { return m_arg1Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg2Impl() const { return m_arg2Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& arg3Impl() const { return m_arg3Impl; } - - private: - const TernaryOp m_functor; - TensorEvaluator m_arg1Impl; - TensorEvaluator m_arg2Impl; - TensorEvaluator m_arg3Impl; -}; - - -// -------------------- SelectOp -------------------- - -template -struct TensorEvaluator, Device> -{ - typedef TensorSelectOp XprType; - typedef typename XprType::Scalar Scalar; - - enum { - IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & - internal::packet_traits::HasBlend, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_condImpl(op.ifExpression(), device), - m_thenImpl(op.thenExpression(), device), - m_elseImpl(op.elseExpression(), device) - { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); - 
eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); - eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename internal::traits::Scalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - typedef typename TensorEvaluator::Dimensions Dimensions; - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const - { - // TODO: use then or else impl instead if they happen to be known at compile time. - return m_condImpl.dimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_condImpl.evalSubExprsIfNeeded(NULL); - m_thenImpl.evalSubExprsIfNeeded(NULL); - m_elseImpl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_condImpl.cleanup(); - m_thenImpl.cleanup(); - m_elseImpl.cleanup(); - } - - EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const - { - return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index); - } - template - EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const - { - internal::Selector select; - for (Index i = 0; i < PacketSize; ++i) { - select.select[i] = m_condImpl.coeff(index+i); - } - return internal::pblend(select, - m_thenImpl.template packet(index), - m_elseImpl.template packet(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return m_condImpl.costPerCoeff(vectorized) + - m_thenImpl.costPerCoeff(vectorized) - .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator & cond_impl() const { return m_condImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& then_impl() const { return m_thenImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& else_impl() const { return m_elseImpl; } - - private: - TensorEvaluator m_condImpl; - TensorEvaluator m_thenImpl; - TensorEvaluator m_elseImpl; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h deleted file mode 100644 index f01d77c..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ /dev/null @@ -1,288 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H - -namespace Eigen { - -/** \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - */ -namespace internal { - -// Default strategy: the expression is evaluated with a single cpu thread. 
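// How an executor is normally reached (a sketch, assuming the TensorAssignOp /
// TensorDevice plumbing; "Dst", "Src" and "MyDevice" are placeholder names):
//
//   // dst.device(my_device) = src;   roughly expands to
//   typedef TensorAssignOp<Dst, const Src> Assign;
//   Assign assign(dst, src);
//   internal::TensorExecutor<const Assign, MyDevice>::run(assign, my_device);
//
// The specializations below differ only in how they walk the evaluator's index
// space (scalar loop, unrolled packet loop, thread-pool partitions, CUDA kernel,
// SYCL dispatch).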
-template -class TensorExecutor -{ - public: - typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const Device& device = Device()) - { - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - for (Index i = 0; i < size; ++i) { - evaluator.evalScalar(i); - } - } - evaluator.cleanup(); - } -}; - - -template -class TensorExecutor -{ - public: - typedef typename Expression::Index Index; - EIGEN_DEVICE_FUNC - static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) - { - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits::PacketReturnType>::size; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not - // unroll the loop at the expense of inlining. - const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; - for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { - evaluator.evalPacket(i + j * PacketSize); - } - } - const Index VectorizedSize = (size / PacketSize) * PacketSize; - for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { - evaluator.evalPacket(i); - } - for (Index i = VectorizedSize; i < size; ++i) { - evaluator.evalScalar(i); - } - } - evaluator.cleanup(); - } -}; - - - -// Multicore strategy: the index space is partitioned and each partition is executed on a single core -#ifdef EIGEN_USE_THREADS -template -struct EvalRange { - static void run(Evaluator* evaluator_in, const Index first, const Index last) { - Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - for (Index i = first; i < last; ++i) { - evaluator.evalScalar(i); - } - } - - static Index alignBlockSize(Index size) { - return size; - } -}; - -template -struct EvalRange { - static const int PacketSize = unpacket_traits::size; - - static void run(Evaluator* evaluator_in, const Index first, const Index last) { - Evaluator evaluator = *evaluator_in; - eigen_assert(last >= first); - Index i = first; - if (last - first >= PacketSize) { - eigen_assert(first % PacketSize == 0); - Index last_chunk_offset = last - 4 * PacketSize; - // Give the compiler a strong hint to unroll the loop. But don't insist - // on unrolling, because if the function is expensive the compiler should not - // unroll the loop at the expense of inlining. - for (; i <= last_chunk_offset; i += 4*PacketSize) { - for (Index j = 0; j < 4; j++) { - evaluator.evalPacket(i + j * PacketSize); - } - } - last_chunk_offset = last - PacketSize; - for (; i <= last_chunk_offset; i += PacketSize) { - evaluator.evalPacket(i); - } - } - for (; i < last; ++i) { - evaluator.evalScalar(i); - } - } - - static Index alignBlockSize(Index size) { - // Align block size to packet size and account for unrolling in run above. - if (size >= 16 * PacketSize) { - return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); - } - // Aligning to 4 * PacketSize would increase block size by more than 25%. 
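//   Worked example, assuming PacketSize == 4: a block of 100 coefficients
//   satisfies 100 >= 16 * 4, so the branch above rounds it up to
//   (100 + 15) & ~15 = 112, while a block of 20 falls through to the return
//   below and is only rounded to (20 + 3) & ~3 = 20.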
- return (size + PacketSize - 1) & ~(PacketSize - 1); - } -}; - -template -class TensorExecutor { - public: - typedef typename Expression::Index Index; - static inline void run(const Expression& expr, const ThreadPoolDevice& device) - { - typedef TensorEvaluator Evaluator; - Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) - { - const Index size = array_prod(evaluator.dimensions()); -#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) - device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), - EvalRange::alignBlockSize, - [&evaluator](Index first, Index last) { - EvalRange::run(&evaluator, first, last); - }); -#else - size_t num_threads = device.numThreads(); - if (num_threads > 1) { - num_threads = TensorCostModel::numThreads( - size, evaluator.costPerCoeff(Vectorizable), num_threads); - } - if (num_threads == 1) { - EvalRange::run(&evaluator, 0, size); - } else { - const Index PacketSize = Vectorizable ? unpacket_traits::size : 1; - Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; - const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); - const Index numblocks = size / blocksize; - - Barrier barrier(numblocks); - for (int i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier( - &barrier, &EvalRange::run, - &evaluator, i * blocksize, (i + 1) * blocksize); - } - if (numblocks * blocksize < size) { - EvalRange::run( - &evaluator, numblocks * blocksize, size); - } - barrier.Wait(); - } -#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) - } - evaluator.cleanup(); - } -}; -#endif // EIGEN_USE_THREADS - - -// GPU: the evaluation of the expression is offloaded to a GPU. -#if defined(EIGEN_USE_GPU) - -template -class TensorExecutor { - public: - typedef typename Expression::Index Index; - static void run(const Expression& expr, const GpuDevice& device); -}; - - -#if defined(__CUDACC__) -template -struct EigenMetaKernelEval { - static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - for (Index i = first; i < last; i += step_size) { - eval.evalScalar(i); - } - } -}; - -template -struct EigenMetaKernelEval { - static __device__ EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, Index first, Index last, Index step_size) { - const Index PacketSize = unpacket_traits::size; - const Index vectorized_size = (last / PacketSize) * PacketSize; - const Index vectorized_step_size = step_size * PacketSize; - - // Use the vector path - for (Index i = first * PacketSize; i < vectorized_size; - i += vectorized_step_size) { - eval.evalPacket(i); - } - for (Index i = vectorized_size + first; i < last; i += step_size) { - eval.evalScalar(i); - } - } -}; - -template -__global__ void -__launch_bounds__(1024) -EigenMetaKernel(Evaluator eval, Index size) { - - const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; - const Index step_size = blockDim.x * gridDim.x; - - const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; - EigenMetaKernelEval::run(eval, first_index, size, step_size); -} - -/*static*/ -template -inline void TensorExecutor::run( - const Expression& expr, const GpuDevice& device) { - TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) { - const int block_size = device.maxCudaThreadsPerBlock(); - const int max_blocks = device.getNumCudaMultiProcessors() * - device.maxCudaThreadsPerMultiProcessor() / block_size; - const Index 
size = array_prod(evaluator.dimensions()); - // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. - const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); - - LAUNCH_CUDA_KERNEL( - (EigenMetaKernel, Index>), - num_blocks, block_size, 0, device, evaluator, size); - } - evaluator.cleanup(); -} - -#endif // __CUDACC__ -#endif // EIGEN_USE_GPU - -// SYCL Executor policy -#ifdef EIGEN_USE_SYCL - -template -class TensorExecutor { -public: - static inline void run(const Expression &expr, const SyclDevice &device) { - // call TensorSYCL module - TensorSycl::run(expr, device); - } -}; - -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h deleted file mode 100644 index 85dfc7a..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ /dev/null @@ -1,371 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H -#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H - -namespace Eigen { - -/** \class TensorExpr - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor expression classes. - * - * The TensorCwiseNullaryOp class applies a nullary operators to an expression. - * This is typically used to generate constants. - * - * The TensorCwiseUnaryOp class represents an expression where a unary operator - * (e.g. cwiseSqrt) is applied to an expression. - * - * The TensorCwiseBinaryOp class represents an expression where a binary - * operator (e.g. addition) is applied to a lhs and a rhs expression. - * - */ -namespace internal { -template -struct traits > - : traits -{ - typedef traits XprTraits; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::Nested XprTypeNested; - typedef typename remove_reference::type _XprTypeNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - - enum { - Flags = 0 - }; -}; - -} // end namespace internal - - - -template -class TensorCwiseNullaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef TensorCwiseNullaryOp Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) - : m_xpr(xpr), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - nestedExpression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - const NullaryOp& functor() const { return m_functor; } - - protected: - typename XprType::Nested m_xpr; - const NullaryOp m_functor; -}; - - - -namespace internal { -template -struct traits > - : traits -{ - // TODO(phli): Add InputScalar, InputPacket. Check references to - // current Scalar/Packet to see if the intent is Input or Output. 
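// Nesting sketch (float tensors A and B assumed; functor names shortened):
//
//   (A + B).sqrt()   builds an expression of the form
//   TensorCwiseUnaryOp<scalar_sqrt_op<float>,
//       const TensorCwiseBinaryOp<scalar_sum_op<float>, const A, const B> >
//
// so no coefficient is computed until the expression is assigned or evaluated.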
- typedef typename result_of::type Scalar; - typedef traits XprTraits; - typedef typename XprType::Nested XprTypeNested; - typedef typename remove_reference::type _XprTypeNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorCwiseUnaryOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorCwiseUnaryOp type; -}; - -} // end namespace internal - - - -template -class TensorCwiseUnaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - // TODO(phli): Add InputScalar, InputPacket. Check references to - // current Scalar/Packet to see if the intent is Input or Output. - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef Scalar CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) - : m_xpr(xpr), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const UnaryOp& functor() const { return m_functor; } - - /** \returns the nested expression */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - nestedExpression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const UnaryOp m_functor; -}; - - -namespace internal { -template -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs - // are different. - // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to - // current Scalar/Packet to see if the intent is Inputs or Output. - typedef typename result_of< - BinaryOp(typename LhsXprType::Scalar, - typename RhsXprType::Scalar)>::type Scalar; - typedef traits XprTraits; - typedef typename promote_storage_type< - typename traits::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type< - typename traits::Index, - typename traits::Index>::type Index; - typedef typename LhsXprType::Nested LhsNested; - typedef typename RhsXprType::Nested RhsNested; - typedef typename remove_reference::type _LhsNested; - typedef typename remove_reference::type _RhsNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorCwiseBinaryOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorCwiseBinaryOp type; -}; - -} // end namespace internal - - - -template -class TensorCwiseBinaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to - // current Scalar/Packet to see if the intent is Inputs or Output. 
- typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef Scalar CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const BinaryOp& functor() const { return m_functor; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - lhsExpression() const { return m_lhs_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - rhsExpression() const { return m_rhs_xpr; } - - protected: - typename LhsXprType::Nested m_lhs_xpr; - typename RhsXprType::Nested m_rhs_xpr; - const BinaryOp m_functor; -}; - - -namespace internal { -template -struct traits > -{ - // Type promotion to handle the case where the types of the args are different. - typedef typename result_of< - TernaryOp(typename Arg1XprType::Scalar, - typename Arg2XprType::Scalar, - typename Arg3XprType::Scalar)>::type Scalar; - typedef traits XprTraits; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; - typedef typename Arg1XprType::Nested Arg1Nested; - typedef typename Arg2XprType::Nested Arg2Nested; - typedef typename Arg3XprType::Nested Arg3Nested; - typedef typename remove_reference::type _Arg1Nested; - typedef typename remove_reference::type _Arg2Nested; - typedef typename remove_reference::type _Arg3Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - - enum { - Flags = 0 - }; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorCwiseTernaryOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorCwiseTernaryOp type; -}; - -} // end namespace internal - - - -template -class TensorCwiseTernaryOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef Scalar CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp()) - : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {} - - EIGEN_DEVICE_FUNC - const TernaryOp& functor() const { return m_functor; } - - /** \returns the nested expressions */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - arg1Expression() const { return m_arg1_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - arg2Expression() const { return m_arg2_xpr; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - arg3Expression() const { return m_arg3_xpr; } - - protected: - typename Arg1XprType::Nested m_arg1_xpr; - typename Arg2XprType::Nested m_arg2_xpr; - typename Arg3XprType::Nested m_arg3_xpr; - const TernaryOp m_functor; -}; - - -namespace internal { -template -struct traits > - : traits -{ - typedef typename traits::Scalar Scalar; - typedef traits XprTraits; - 
typedef typename promote_storage_type::StorageKind, - typename traits::StorageKind>::ret StorageKind; - typedef typename promote_index_type::Index, - typename traits::Index>::type Index; - typedef typename IfXprType::Nested IfNested; - typedef typename ThenXprType::Nested ThenNested; - typedef typename ElseXprType::Nested ElseNested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorSelectOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorSelectOp type; -}; - -} // end namespace internal - - -template -class TensorSelectOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::promote_storage_type::ret CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC - TensorSelectOp(const IfXprType& a_condition, - const ThenXprType& a_then, - const ElseXprType& a_else) - : m_condition(a_condition), m_then(a_then), m_else(a_else) - { } - - EIGEN_DEVICE_FUNC - const IfXprType& ifExpression() const { return m_condition; } - - EIGEN_DEVICE_FUNC - const ThenXprType& thenExpression() const { return m_then; } - - EIGEN_DEVICE_FUNC - const ElseXprType& elseExpression() const { return m_else; } - - protected: - typename IfXprType::Nested m_condition; - typename ThenXprType::Nested m_then; - typename ElseXprType::Nested m_else; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h deleted file mode 100644 index 08eb559..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ /dev/null @@ -1,651 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Jianwei Cui -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H -#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H - -// This code requires the ability to initialize arrays of constant -// values directly inside a class. -#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900 - -namespace Eigen { - -/** \class TensorFFT - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor FFT class. 
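 *
 * Usage sketch (assuming the usual TensorBase::fft entry point, with the result
 * kind and the direction passed as template arguments):
 *
 *   Eigen::Tensor<float, 2> in(4, 8);
 *   Eigen::array<int, 1> dims = {{0}};   // transform along dimension 0
 *   Eigen::Tensor<std::complex<float>, 2> out =
 *       in.fft<Eigen::BothParts, Eigen::FFT_FORWARD>(dims);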
- * - * TODO: - * Vectorize the Cooley Tukey and the Bluestein algorithm - * Add support for multithreaded evaluation - * Improve the performance on GPU - */ - -template struct MakeComplex { - template - EIGEN_DEVICE_FUNC - T operator() (const T& val) const { return val; } -}; - -template <> struct MakeComplex { - template - EIGEN_DEVICE_FUNC - std::complex operator() (const T& val) const { return std::complex(val, 0); } -}; - -template <> struct MakeComplex { - template - EIGEN_DEVICE_FUNC - std::complex operator() (const std::complex& val) const { return val; } -}; - -template struct PartOf { - template T operator() (const T& val) const { return val; } -}; - -template <> struct PartOf { - template T operator() (const std::complex& val) const { return val.real(); } -}; - -template <> struct PartOf { - template T operator() (const std::complex& val) const { return val.imag(); } -}; - -namespace internal { -template -struct traits > : public traits { - typedef traits XprTraits; - typedef typename NumTraits::Real RealScalar; - typedef typename std::complex ComplexScalar; - typedef typename XprTraits::Scalar InputScalar; - typedef typename conditional::type OutputScalar; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> { - typedef const TensorFFTOp& type; -}; - -template -struct nested, 1, typename eval >::type> { - typedef TensorFFTOp type; -}; - -} // end namespace internal - -template -class TensorFFTOp : public TensorBase, ReadOnlyAccessors> { - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename std::complex ComplexScalar; - typedef typename internal::conditional::type OutputScalar; - typedef OutputScalar CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) - : m_xpr(expr), m_fft(fft) {} - - EIGEN_DEVICE_FUNC - const FFT& fft() const { return m_fft; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& expression() const { - return m_xpr; - } - - protected: - typename XprType::Nested m_xpr; - const FFT m_fft; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> { - typedef TensorFFTOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename std::complex ComplexScalar; - typedef typename TensorEvaluator::Dimensions InputDimensions; - typedef internal::traits XprTraits; - typedef typename XprTraits::Scalar InputScalar; - typedef typename internal::conditional::type OutputScalar; - typedef OutputScalar CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = true, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - eigen_assert(input_dims[i] > 0); - m_dimensions[i] = input_dims[i]; - } - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; - } - } else { - m_strides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; - } - } - m_size = m_dimensions.TotalSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_dimensions; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { - m_impl.evalSubExprsIfNeeded(NULL); - if (data) { - evalToBuf(data); - return false; - } else { - m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); - evalToBuf(m_data); - return true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_data) { - m_device.deallocate(m_data); - m_data = NULL; - } - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { - return m_data[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType - packet(Index index) const { - return internal::ploadt(m_data + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } - - - private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { - const bool write_to_out = internal::is_same::value; - ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); - - for (Index i = 0; i < m_size; ++i) { - buf[i] = MakeComplex::value>()(m_impl.coeff(i)); - } - - for (size_t i = 0; i < m_fft.size(); ++i) { - Index dim = m_fft[i]; - eigen_assert(dim >= 0 && dim < NumDims); - Index line_len = m_dimensions[dim]; - eigen_assert(line_len >= 1); - ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); - const bool is_power_of_two = isPowerOfTwo(line_len); - const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); - const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); - - ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); - ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); - ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); - if (!is_power_of_two) { - // Compute twiddle factors - // t_n = exp(sqrt(-1) * pi * n^2 / line_len) - // for n = 0, 1,..., line_len-1. 
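// (The recurrence below is exact: writing t_n = exp(i*pi*n^2/line_len), one has
//  2*(n-1)^2 - (n-2)^2 + 2*1^2 = n^2, hence t_{n-1}^2 / t_{n-2} * t_1^2 = t_n.)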
- // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 - pos_j_base_powered[0] = ComplexScalar(1, 0); - if (line_len > 1) { - const RealScalar pi_over_len(EIGEN_PI / line_len); - const ComplexScalar pos_j_base = ComplexScalar( - std::cos(pi_over_len), std::sin(pi_over_len)); - pos_j_base_powered[1] = pos_j_base; - if (line_len > 2) { - const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; - for (int j = 2; j < line_len + 1; ++j) { - pos_j_base_powered[j] = pos_j_base_powered[j - 1] * - pos_j_base_powered[j - 1] / - pos_j_base_powered[j - 2] * pos_j_base_sq; - } - } - } - } - - for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { - const Index base_offset = getBaseOffsetFromIndex(partial_index, dim); - - // get data into line_buf - const Index stride = m_strides[dim]; - if (stride == 1) { - memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); - } else { - Index offset = base_offset; - for (int j = 0; j < line_len; ++j, offset += stride) { - line_buf[j] = buf[offset]; - } - } - - // processs the line - if (is_power_of_two) { - processDataLineCooleyTukey(line_buf, line_len, log_len); - } - else { - processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); - } - - // write back - if (FFTDir == FFT_FORWARD && stride == 1) { - memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); - } else { - Index offset = base_offset; - const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); - for (int j = 0; j < line_len; ++j, offset += stride) { - buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor; - } - } - } - m_device.deallocate(line_buf); - if (!is_power_of_two) { - m_device.deallocate(a); - m_device.deallocate(b); - m_device.deallocate(pos_j_base_powered); - } - } - - if(!write_to_out) { - for (Index i = 0; i < m_size; ++i) { - data[i] = PartOf()(buf[i]); - } - m_device.deallocate(buf); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) { - eigen_assert(x > 0); - return !(x & (x - 1)); - } - - // The composite number for padding, used in Bluestein's FFT algorithm - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) { - Index i = 2; - while (i < 2 * n - 1) i *= 2; - return i; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) { - Index log2m = 0; - while (m >>= 1) log2m++; - return log2m; - } - - // Call Cooley Tukey algorithm directly, data length must be power of 2 - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) { - eigen_assert(isPowerOfTwo(line_len)); - scramble_FFT(line_buf, line_len); - compute_1D_Butterfly(line_buf, line_len, log_len); - } - - // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) { - Index n = line_len; - Index m = good_composite; - ComplexScalar* data = line_buf; - - for (Index i = 0; i < n; ++i) { - if(FFTDir == FFT_FORWARD) { - a[i] = data[i] * numext::conj(pos_j_base_powered[i]); - } - else { - a[i] = data[i] * pos_j_base_powered[i]; - } - } - for (Index i = n; i < m; ++i) { - a[i] = ComplexScalar(0, 0); - } - - for (Index i = 0; i < n; ++i) { - if(FFTDir == FFT_FORWARD) { 
- b[i] = pos_j_base_powered[i]; - } - else { - b[i] = numext::conj(pos_j_base_powered[i]); - } - } - for (Index i = n; i < m - n; ++i) { - b[i] = ComplexScalar(0, 0); - } - for (Index i = m - n; i < m; ++i) { - if(FFTDir == FFT_FORWARD) { - b[i] = pos_j_base_powered[m-i]; - } - else { - b[i] = numext::conj(pos_j_base_powered[m-i]); - } - } - - scramble_FFT(a, m); - compute_1D_Butterfly(a, m, log_len); - - scramble_FFT(b, m); - compute_1D_Butterfly(b, m, log_len); - - for (Index i = 0; i < m; ++i) { - a[i] *= b[i]; - } - - scramble_FFT(a, m); - compute_1D_Butterfly(a, m, log_len); - - //Do the scaling after ifft - for (Index i = 0; i < m; ++i) { - a[i] /= m; - } - - for (Index i = 0; i < n; ++i) { - if(FFTDir == FFT_FORWARD) { - data[i] = a[i] * numext::conj(pos_j_base_powered[i]); - } - else { - data[i] = a[i] * pos_j_base_powered[i]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) { - eigen_assert(isPowerOfTwo(n)); - Index j = 1; - for (Index i = 1; i < n; ++i){ - if (j > i) { - std::swap(data[j-1], data[i-1]); - } - Index m = n >> 1; - while (m >= 2 && j > m) { - j -= m; - m >>= 1; - } - j += m; - } - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) { - ComplexScalar tmp = data[1]; - data[1] = data[0] - data[1]; - data[0] += tmp; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) { - ComplexScalar tmp[4]; - tmp[0] = data[0] + data[1]; - tmp[1] = data[0] - data[1]; - tmp[2] = data[2] + data[3]; - if (Dir == FFT_FORWARD) { - tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); - } else { - tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); - } - data[0] = tmp[0] + tmp[2]; - data[1] = tmp[1] + tmp[3]; - data[2] = tmp[0] - tmp[2]; - data[3] = tmp[1] - tmp[3]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) { - ComplexScalar tmp_1[8]; - ComplexScalar tmp_2[8]; - - tmp_1[0] = data[0] + data[1]; - tmp_1[1] = data[0] - data[1]; - tmp_1[2] = data[2] + data[3]; - if (Dir == FFT_FORWARD) { - tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); - } else { - tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); - } - tmp_1[4] = data[4] + data[5]; - tmp_1[5] = data[4] - data[5]; - tmp_1[6] = data[6] + data[7]; - if (Dir == FFT_FORWARD) { - tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); - } else { - tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); - } - tmp_2[0] = tmp_1[0] + tmp_1[2]; - tmp_2[1] = tmp_1[1] + tmp_1[3]; - tmp_2[2] = tmp_1[0] - tmp_1[2]; - tmp_2[3] = tmp_1[1] - tmp_1[3]; - tmp_2[4] = tmp_1[4] + tmp_1[6]; -// SQRT2DIV2 = sqrt(2)/2 -#define SQRT2DIV2 0.7071067811865476 - if (Dir == FFT_FORWARD) { - tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); - tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); - tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); - } else { - tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); - tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); - tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); - } - data[0] = tmp_2[0] + tmp_2[4]; - data[1] = tmp_2[1] + tmp_2[5]; - data[2] = tmp_2[2] + tmp_2[6]; - data[3] = tmp_2[3] + tmp_2[7]; - data[4] = tmp_2[0] - tmp_2[4]; - data[5] = tmp_2[1] - tmp_2[5]; - data[6] = tmp_2[2] - tmp_2[6]; - data[7] = tmp_2[3] - tmp_2[7]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge( - 
ComplexScalar* data, Index n, Index n_power_of_2) { - // Original code: - // RealScalar wtemp = std::sin(M_PI/n); - // RealScalar wpi = -std::sin(2 * M_PI/n); - const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; - const RealScalar wpi = (Dir == FFT_FORWARD) - ? m_minus_sin_2_PI_div_n_LUT[n_power_of_2] - : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; - - const ComplexScalar wp(wtemp, wpi); - const ComplexScalar wp_one = wp + ComplexScalar(1, 0); - const ComplexScalar wp_one_2 = wp_one * wp_one; - const ComplexScalar wp_one_3 = wp_one_2 * wp_one; - const ComplexScalar wp_one_4 = wp_one_3 * wp_one; - const Index n2 = n / 2; - ComplexScalar w(1.0, 0.0); - for (Index i = 0; i < n2; i += 4) { - ComplexScalar temp0(data[i + n2] * w); - ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); - ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); - ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3); - w = w * wp_one_4; - - data[i + n2] = data[i] - temp0; - data[i] += temp0; - - data[i + 1 + n2] = data[i + 1] - temp1; - data[i + 1] += temp1; - - data[i + 2 + n2] = data[i + 2] - temp2; - data[i + 2] += temp2; - - data[i + 3 + n2] = data[i + 3] - temp3; - data[i + 3] += temp3; - } - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly( - ComplexScalar* data, Index n, Index n_power_of_2) { - eigen_assert(isPowerOfTwo(n)); - if (n > 8) { - compute_1D_Butterfly(data, n / 2, n_power_of_2 - 1); - compute_1D_Butterfly(data + n / 2, n / 2, n_power_of_2 - 1); - butterfly_1D_merge(data, n, n_power_of_2); - } else if (n == 8) { - butterfly_8(data); - } else if (n == 4) { - butterfly_4(data); - } else if (n == 2) { - butterfly_2(data); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { - Index result = 0; - - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > omitted_dim; --i) { - const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; - const Index idx = index / partial_m_stride; - index -= idx * partial_m_stride; - result += idx * m_strides[i]; - } - result += index; - } - else { - for (Index i = 0; i < omitted_dim; ++i) { - const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; - const Index idx = index / partial_m_stride; - index -= idx * partial_m_stride; - result += idx * m_strides[i]; - } - result += index; - } - // Value of index_coords[omitted_dim] is not determined to this step - return result; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const { - Index result = base + offset * m_strides[omitted_dim] ; - return result; - } - - protected: - Index m_size; - const FFT& m_fft; - Dimensions m_dimensions; - array m_strides; - TensorEvaluator m_impl; - CoeffReturnType* m_data; - const Device& m_device; - - // This will support a maximum FFT size of 2^32 for each dimension - // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; - const RealScalar m_sin_PI_div_n_LUT[32] = { - RealScalar(0.0), - RealScalar(-2), - RealScalar(-0.999999999999999), - RealScalar(-0.292893218813453), - RealScalar(-0.0761204674887130), - RealScalar(-0.0192147195967696), - RealScalar(-0.00481527332780311), - RealScalar(-0.00120454379482761), - RealScalar(-3.01181303795779e-04), - RealScalar(-7.52981608554592e-05), - RealScalar(-1.88247173988574e-05), - RealScalar(-4.70619042382852e-06), - RealScalar(-1.17654829809007e-06), - RealScalar(-2.94137117780840e-07), - 
RealScalar(-7.35342821488550e-08), - RealScalar(-1.83835707061916e-08), - RealScalar(-4.59589268710903e-09), - RealScalar(-1.14897317243732e-09), - RealScalar(-2.87243293150586e-10), - RealScalar( -7.18108232902250e-11), - RealScalar(-1.79527058227174e-11), - RealScalar(-4.48817645568941e-12), - RealScalar(-1.12204411392298e-12), - RealScalar(-2.80511028480785e-13), - RealScalar(-7.01277571201985e-14), - RealScalar(-1.75319392800498e-14), - RealScalar(-4.38298482001247e-15), - RealScalar(-1.09574620500312e-15), - RealScalar(-2.73936551250781e-16), - RealScalar(-6.84841378126949e-17), - RealScalar(-1.71210344531737e-17), - RealScalar(-4.28025861329343e-18) - }; - - // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); - const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { - RealScalar(0.0), - RealScalar(0.0), - RealScalar(-1.00000000000000e+00), - RealScalar(-7.07106781186547e-01), - RealScalar(-3.82683432365090e-01), - RealScalar(-1.95090322016128e-01), - RealScalar(-9.80171403295606e-02), - RealScalar(-4.90676743274180e-02), - RealScalar(-2.45412285229123e-02), - RealScalar(-1.22715382857199e-02), - RealScalar(-6.13588464915448e-03), - RealScalar(-3.06795676296598e-03), - RealScalar(-1.53398018628477e-03), - RealScalar(-7.66990318742704e-04), - RealScalar(-3.83495187571396e-04), - RealScalar(-1.91747597310703e-04), - RealScalar(-9.58737990959773e-05), - RealScalar(-4.79368996030669e-05), - RealScalar(-2.39684498084182e-05), - RealScalar(-1.19842249050697e-05), - RealScalar(-5.99211245264243e-06), - RealScalar(-2.99605622633466e-06), - RealScalar(-1.49802811316901e-06), - RealScalar(-7.49014056584716e-07), - RealScalar(-3.74507028292384e-07), - RealScalar(-1.87253514146195e-07), - RealScalar(-9.36267570730981e-08), - RealScalar(-4.68133785365491e-08), - RealScalar(-2.34066892682746e-08), - RealScalar(-1.17033446341373e-08), - RealScalar(-5.85167231706864e-09), - RealScalar(-2.92583615853432e-09) - }; -}; - -} // end namespace Eigen - -#endif // EIGEN_HAS_CONSTEXPR - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h deleted file mode 100644 index fcee5f6..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ /dev/null @@ -1,389 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H -#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H - -namespace Eigen { - -/** \class TensorFixedSize - * \ingroup CXX11_Tensor_Module - * - * \brief The fixed sized version of the tensor class. 
- * - * The fixed sized equivalent of - * Eigen::Tensor t(3, 5, 7); - * is - * Eigen::TensorFixedSize> t; - */ - -template -class TensorFixedSize : public TensorBase > -{ - public: - typedef TensorFixedSize Self; - typedef TensorBase > Base; - typedef typename Eigen::internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - typedef Scalar_ Scalar; - typedef typename NumTraits::Real RealScalar; - typedef typename Base::CoeffReturnType CoeffReturnType; - - static const int Options = Options_; - - enum { - IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0), - Layout = Options_ & RowMajor ? RowMajor : ColMajor, - CoordAccess = true, - RawAccess = true - }; - - typedef Dimensions_ Dimensions; - static const std::size_t NumIndices = Dimensions::count; - - protected: - TensorStorage m_storage; - - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } - - // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED - // work, because that uses base().coeffRef() - and we don't yet - // implement a similar class hierarchy - inline Self& base() { return *this; } - inline const Self& base() const { return *this; } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeff(array{{firstIndex, otherIndices...}}); - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& coeff() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
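// The \brief comment above shows the dynamic Tensor and its fixed-size equivalent, but
// the template arguments were lost in this diff. A hedged reconstruction of that usage:
// with TensorFixedSize the dimensions are encoded in the type via Eigen::Sizes, so the
// storage is inline and no runtime allocation happens. (Sketch only; the function and
// variable names are illustrative.)
#include <unsupported/Eigen/CXX11/Tensor>

void fixed_size_usage() {
  // Dynamic tensor: rank 3, dimensions 3x5x7 chosen at runtime, heap-allocated.
  Eigen::Tensor<float, 3> dynamic(3, 5, 7);
  // Fixed-size equivalent: same rank and dimensions, known at compile time.
  Eigen::TensorFixedSize<float, Eigen::Sizes<3, 5, 7>> fixed;

  dynamic.setZero();
  fixed.setZero();
  fixed(1, 2, 3) = 4.0f;   // coefficient access mirrors the dynamic Tensor API
}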
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(array{{firstIndex, otherIndices...}}); - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) - { - eigen_internal_assert(checkIndexRange(indices)); - return m_storage.data()[linearizedIndex(indices)]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - eigen_internal_assert(index >= 0 && index < size()); - return m_storage.data()[index]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& coeffRef() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return m_storage.data()[0]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return this->operator()(array{{firstIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const - { - if (Options&RowMajor) { - const Index index = i1 + i0 * m_storage.dimensions()[1]; - return m_storage.data()[index]; - } else { - const Index index = i0 + i1 * m_storage.dimensions()[0]; - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const - { - if (Options&RowMajor) { - const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const - { - if (Options&RowMajor) { - const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const - { - if (Options&RowMajor) { - const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); - return m_storage.data()[index]; - } - } -#endif - - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const - { - eigen_assert(checkIndexRange(indices)); - return coeff(indices); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return coeff(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()() const - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator[](Index index) 
const - { - // The bracket operator is only for vectors, use the parenthesis operator instead. - EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeff(index); - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) - { - // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - return operator()(array{{firstIndex, otherIndices...}}); - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) - { - if (Options&RowMajor) { - const Index index = i1 + i0 * m_storage.dimensions()[1]; - return m_storage.data()[index]; - } else { - const Index index = i0 + i1 * m_storage.dimensions()[0]; - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) - { - if (Options&RowMajor) { - const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) - { - if (Options&RowMajor) { - const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); - return m_storage.data()[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) - { - if (Options&RowMajor) { - const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); - return m_storage.data()[index]; - } else { - const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); - return m_storage.data()[index]; - } - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) - { - eigen_assert(checkIndexRange(indices)); - return coeffRef(indices); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index index) - { - eigen_assert(index >= 0 && index < size()); - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - return coeffRef(); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator[](Index index) - { - // The bracket operator is only for vectors, use the parenthesis operator instead - EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - return coeffRef(index); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize() - : m_storage() - { - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) - : m_storage(other.m_storage) - { - } - -#if EIGEN_HAS_RVALUE_REFERENCES - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) - : m_storage(other.m_storage) - { - } -#endif - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE 
TensorFixedSize(const TensorBase& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other.derived()); - internal::TensorExecutor::run(assign, DefaultDevice()); - } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other.derived()); - internal::TensorExecutor::run(assign, DefaultDevice()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize& operator=(const TensorFixedSize& other) - { - // FIXME: check that the dimensions of other match the dimensions of *this. - // Unfortunately this isn't possible yet when the rhs is an expression. - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorFixedSize& operator=(const OtherDerived& other) - { - // FIXME: check that the dimensions of other match the dimensions of *this. - // Unfortunately this isn't possible yet when the rhs is an expression. - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - protected: - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE bool checkIndexRange(const array& /*indices*/) const - { - using internal::array_apply_and_reduce; - using internal::array_zip_and_reduce; - using internal::greater_equal_zero_op; - using internal::logical_and_op; - using internal::lesser_op; - - return true; - // check whether the indices are all >= 0 - /* array_apply_and_reduce(indices) && - // check whether the indices fit in the dimensions - array_zip_and_reduce(indices, m_storage.dimensions());*/ - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const - { - if (Options&RowMajor) { - return m_storage.dimensions().IndexOfRowMajor(indices); - } else { - return m_storage.dimensions().IndexOfColMajor(indices); - } - } -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h deleted file mode 100644 index 8bece4e..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ /dev/null @@ -1,169 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H -#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H - -namespace Eigen { - -namespace internal { -template class MakePointer_> -struct traits > -{ - // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; - - enum { - Flags = 0 - }; - template struct MakePointer { - // Intermediate typedef to workaround MSVC issue. 
- typedef MakePointer_ MakePointerT; - typedef typename MakePointerT::Type Type; - }; -}; - -template class MakePointer_> -struct eval, Eigen::Dense> -{ - typedef const TensorForcedEvalOp& type; -}; - -template class MakePointer_> -struct nested, 1, typename eval >::type> -{ - typedef TensorForcedEvalOp type; -}; - -} // end namespace internal - - - -// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) - -/** \class TensorForcedEvalOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reshaping class. - * - * - */ -/// `template class MakePointer_` is added to convert the host pointer to the device pointer. -/// It is added due to the fact that for our device compiler `T*` is not allowed. -/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. -/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . -/// Therefore, by adding the default value, we managed to convert the type and it does not break any -/// existing code as its default value is `T*`. -template class MakePointer_> -class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) - : m_xpr(expr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; -}; - - -template class MakePointer_> -struct TensorEvaluator, Device> -{ - typedef TensorForcedEvalOp XprType; - typedef typename ArgType::Scalar Scalar; - typedef typename TensorEvaluator::Dimensions Dimensions; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = true, - PacketAccess = (PacketSize > 1), - Layout = TensorEvaluator::Layout, - RawAccess = true - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - /// op_ is used for sycl - : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) - { } - - EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); - // Should initialize the memory in case we're dealing with non POD types. 
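// A hedged usage sketch for the forced-evaluation op above: calling .eval() on a tensor
// expression wraps it in TensorForcedEvalOp, and the evaluator materializes the result
// into a device-allocated buffer once, so consumers read coefficients from memory
// instead of re-running the sub-expression. Names below are illustrative.
#include <unsupported/Eigen/CXX11/Tensor>

void forced_eval_usage() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
  a.setRandom();
  b.setRandom();

  // Without .eval(), (a + b) would be recomputed by every expression that consumes it;
  // with .eval(), the sum is evaluated eagerly into a temporary buffer first.
  Eigen::Tensor<float, 2> c = (a + b).eval() * 0.5f;
}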
- if (NumTraits::RequireInitialization) { - for (Index i = 0; i < numValues; ++i) { - new(m_buffer+i) CoeffReturnType(); - } - } - typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; - EvalTo evalToTmp(m_buffer, m_op); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::type, PacketAccess>::run(evalToTmp, m_device); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_device.deallocate(m_buffer); - m_buffer = NULL; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_buffer[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return internal::ploadt(m_buffer + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC typename MakePointer::Type data() const { return m_buffer; } - - /// required by sycl in order to extract the sycl accessor - const TensorEvaluator& impl() { return m_impl; } - /// used by sycl in order to build the sycl buffer - const Device& device() const{return m_device;} - private: - TensorEvaluator m_impl; - const ArgType m_op; - const Device& m_device; - typename MakePointer::Type m_buffer; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h deleted file mode 100644 index 52b803d..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ /dev/null @@ -1,109 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H -#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H - -namespace Eigen { - -// MakePointer class is used as a container of the adress space of the pointer -// on the host and on the device. From the host side it generates the T* pointer -// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to -// T* m_data on the host. It is always called on the device. -// Specialisation of MakePointer class for creating the sycl buffer with -// map_allocator. 
-template struct MakePointer { - typedef T* Type; -}; - -template class MakePointer_ = MakePointer> class TensorMap; -template class Tensor; -template class TensorFixedSize; -template class TensorRef; -template class TensorBase; - -template class TensorCwiseNullaryOp; -template class TensorCwiseUnaryOp; -template class TensorCwiseBinaryOp; -template class TensorCwiseTernaryOp; -template class TensorSelectOp; -template class MakePointer_ = MakePointer > class TensorReductionOp; -template class TensorIndexTupleOp; -template class TensorTupleReducerOp; -template class TensorConcatenationOp; -template class TensorContractionOp; -template class TensorConversionOp; -template class TensorConvolutionOp; -template class TensorFFTOp; -template class TensorPatchOp; -template class TensorImagePatchOp; -template class TensorVolumePatchOp; -template class TensorBroadcastingOp; -template class TensorChippingOp; -template class TensorReshapingOp; -template class TensorLayoutSwapOp; -template class TensorSlicingOp; -template class TensorReverseOp; -template class TensorPaddingOp; -template class TensorShufflingOp; -template class TensorStridingOp; -template class TensorStridingSlicingOp; -template class TensorInflationOp; -template class TensorGeneratorOp; -template class TensorAssignOp; -template class TensorScanOp; - -template class TensorCustomUnaryOp; -template class TensorCustomBinaryOp; - -template class MakePointer_ = MakePointer> class TensorEvalToOp; -template class MakePointer_ = MakePointer> class TensorForcedEvalOp; - -template class TensorDevice; -template struct TensorEvaluator; - -struct DefaultDevice; -struct ThreadPoolDevice; -struct GpuDevice; -struct SyclDevice; - -enum FFTResultType { - RealPart = 0, - ImagPart = 1, - BothParts = 2 -}; - -enum FFTDirection { - FFT_FORWARD = 0, - FFT_REVERSE = 1 -}; - - -namespace internal { - -template -struct IsVectorizable { - static const bool value = TensorEvaluator::PacketAccess; -}; - -template -struct IsVectorizable { - static const bool value = TensorEvaluator::PacketAccess && - TensorEvaluator::IsAligned; -}; - -template ::value> -class TensorExecutor; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h deleted file mode 100644 index d73f6dc..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ /dev/null @@ -1,489 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H -#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H - -namespace Eigen { -namespace internal { - - -/** \internal - * \brief Template functor to compute the modulo between an array and a scalar. - */ -template -struct scalar_mod_op { - EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; } - const Scalar m_divisor; -}; -template -struct functor_traits > -{ enum { Cost = scalar_div_cost::value, PacketAccess = false }; }; - - -/** \internal - * \brief Template functor to compute the modulo between 2 arrays. 
- */ -template -struct scalar_mod2_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); - EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } -}; -template -struct functor_traits > -{ enum { Cost = scalar_div_cost::value, PacketAccess = false }; }; - -template -struct scalar_fmod_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op); - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar - operator()(const Scalar& a, const Scalar& b) const { - return numext::fmod(a, b); - } -}; -template -struct functor_traits > { - enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell. - PacketAccess = false }; -}; - - -/** \internal - * \brief Template functor to compute the sigmoid of a scalar - * \sa class CwiseUnaryOp, ArrayBase::sigmoid() - */ -template -struct scalar_sigmoid_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { - const T one = T(1); - return one / (one + numext::exp(-x)); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(const Packet& x) const { - const Packet one = pset1(T(1)); - return pdiv(one, padd(one, pexp(pnegate(x)))); - } -}; - -template -struct functor_traits > { - enum { - Cost = NumTraits::AddCost * 2 + NumTraits::MulCost * 6, - PacketAccess = packet_traits::HasAdd && packet_traits::HasDiv && - packet_traits::HasNegate && packet_traits::HasExp - }; -}; - - -template -struct reducer_traits { - enum { - Cost = 1, - PacketAccess = false - }; -}; - -// Standard reduction functors -template struct SumReducer -{ - static const bool PacketAccess = packet_traits::HasAdd; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - internal::scalar_sum_op sum_op; - *accum = sum_op(*accum, t); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = padd(*accum, p); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - internal::scalar_cast_op conv; - return conv(0); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - internal::scalar_sum_op sum_op; - return sum_op(saccum, predux(vaccum)); - } -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd - }; -}; - - -template struct MeanReducer -{ - static const bool PacketAccess = packet_traits::HasAdd && !NumTraits::IsInteger; - static const bool IsStateful = true; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - MeanReducer() : scalarCount_(0), packetCount_(0) { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { - internal::scalar_sum_op sum_op; - *accum = sum_op(*accum, t); - scalarCount_++; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { - (*accum) = padd(*accum, p); - packetCount_++; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - internal::scalar_cast_op conv; - return conv(0); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return 
pset1(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum / scalarCount_; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return pdiv(vaccum, pset1(packetCount_)); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - internal::scalar_sum_op sum_op; - return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits::size); - } - - protected: - DenseIndex scalarCount_; - DenseIndex packetCount_; -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd - }; -}; - - -template -struct MinMaxBottomValue { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return Eigen::NumTraits::lowest(); - } -}; -template -struct MinMaxBottomValue { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return -Eigen::NumTraits::infinity(); - } -}; -template -struct MinMaxBottomValue { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return Eigen::NumTraits::highest(); - } -}; -template -struct MinMaxBottomValue { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { - return Eigen::NumTraits::infinity(); - } -}; - - -template struct MaxReducer -{ - static const bool PacketAccess = packet_traits::HasMax; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t > *accum) { *accum = t; } - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = pmax(*accum, p); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return MinMaxBottomValue::IsInteger>::bottom_value(); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - return numext::maxi(saccum, predux_max(vaccum)); - } -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMax - }; -}; - - -template struct MinReducer -{ - static const bool PacketAccess = packet_traits::HasMin; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t < *accum) { *accum = t; } - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = pmin(*accum, p); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return MinMaxBottomValue::IsInteger>::bottom_value(); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - return numext::mini(saccum, predux_min(vaccum)); - } -}; - -template -struct 
reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMin - }; -}; - - -template struct ProdReducer -{ - static const bool PacketAccess = packet_traits::HasMul; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - internal::scalar_product_op prod_op; - (*accum) = prod_op(*accum, t); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { - (*accum) = pmul(*accum, p); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - internal::scalar_cast_op conv; - return conv(1); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { - return pset1(initialize()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - return accum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return vaccum; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { - internal::scalar_product_op prod_op; - return prod_op(saccum, predux_mul(vaccum)); - } -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::MulCost, - PacketAccess = PacketType::HasMul - }; -}; - - -struct AndReducer -{ - static const bool PacketAccess = false; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { - *accum = *accum && t; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { - return accum; - } -}; - -template -struct reducer_traits { - enum { - Cost = 1, - PacketAccess = false - }; -}; - - -struct OrReducer { - static const bool PacketAccess = false; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { - *accum = *accum || t; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { - return false; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { - return accum; - } -}; - -template -struct reducer_traits { - enum { - Cost = 1, - PacketAccess = false - }; -}; - - -// Argmin/Argmax reducers -template struct ArgMaxTupleReducer -{ - static const bool PacketAccess = false; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t.second > accum->second) { *accum = t; } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return T(0, NumTraits::lowest()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { - return accum; - } -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = false - }; -}; - - -template struct ArgMinTupleReducer -{ - static const bool PacketAccess = false; - static const bool IsStateful = false; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { - if (t.second < accum->second) { *accum = t; } - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return T(0, NumTraits::highest()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { - return accum; - } -}; - -template -struct reducer_traits, Device> { - enum { - Cost = NumTraits::AddCost, - PacketAccess = false - }; -}; - - -template -class GaussianGenerator { - public: 
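// The reducers above (SumReducer, MeanReducer, MaxReducer, MinReducer, ProdReducer, ...)
// share one protocol: initialize() yields the identity element, reduce() folds a single
// value into the accumulator, and finalize() post-processes the result (this is where
// MeanReducer divides by the element count); the Packet* variants extend the same idea
// to SIMD lanes. A minimal standalone illustration of that protocol, using plain
// doubles and illustrative names rather than Eigen types:
#include <cstdio>

struct SumLikeReducer {
  double initialize() const { return 0.0; }                    // identity for addition
  void reduce(double t, double* accum) const { *accum += t; }  // fold one value in
  double finalize(double accum) const { return accum; }        // no post-processing
};

int main() {
  const double data[4] = {1.0, 2.0, 3.0, 4.0};
  SumLikeReducer reducer;
  double acc = reducer.initialize();
  for (double v : data) reducer.reduce(v, &acc);
  std::printf("reduced value = %g\n", reducer.finalize(acc));  // prints 10
  return 0;
}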
- static const bool PacketAccess = false; - - EIGEN_DEVICE_FUNC GaussianGenerator(const array& means, - const array& std_devs) - : m_means(means) - { - for (size_t i = 0; i < NumDims; ++i) { - m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; - } - } - - EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { - T tmp = T(0); - for (size_t i = 0; i < NumDims; ++i) { - T offset = coordinates[i] - m_means[i]; - tmp += offset * offset / m_two_sigmas[i]; - } - return numext::exp(-tmp); - } - - private: - array m_means; - array m_two_sigmas; -}; - -template -struct functor_traits > { - enum { - Cost = NumDims * (2 * NumTraits::AddCost + NumTraits::MulCost + - functor_traits >::Cost) + - functor_traits >::Cost, - PacketAccess = GaussianGenerator::PacketAccess - }; -}; - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h deleted file mode 100644 index e27753b..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ /dev/null @@ -1,185 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H -#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H - -namespace Eigen { - -/** \class TensorGeneratorOp - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor generator class. - * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorGeneratorOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorGeneratorOp type; -}; - -} // end namespace internal - - - -template -class TensorGeneratorOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator) - : m_xpr(expr), m_generator(generator) {} - - EIGEN_DEVICE_FUNC - const Generator& generator() const { return m_generator; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const Generator m_generator; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorGeneratorOp XprType; - typedef typename XprType::Index Index; - typedef typename TensorEvaluator::Dimensions Dimensions; - static const int NumDims = internal::array_size::value; - typedef 
typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - enum { - IsAligned = false, - PacketAccess = (internal::unpacket_traits::size > 1), - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_generator(op.generator()) - { - TensorEvaluator impl(op.expression(), device); - m_dimensions = impl.dimensions(); - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; - } - } else { - m_strides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - array coords; - extract_coordinates(index, coords); - return m_generator(coords); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - for (int i = 0; i < packetSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool) const { - // TODO(rmlarsen): This is just a placeholder. Define interface to make - // generators return their cost. - return TensorOpCost(0, 0, TensorOpCost::AddCost() + - TensorOpCost::MulCost()); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void extract_coordinates(Index index, array& coords) const { - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_strides[i]; - index -= idx * m_strides[i]; - coords[i] = idx; - } - coords[0] = index; - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_strides[i]; - index -= idx * m_strides[i]; - coords[i] = idx; - } - coords[NumDims-1] = index; - } - } - - Dimensions m_dimensions; - array m_strides; - Generator m_generator; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h deleted file mode 100644 index 665b861..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h +++ /dev/null @@ -1,33 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Eugene Brevdo -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
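// A hedged usage sketch for the generator op above: TensorBase exposes .generate(gen),
// and the evaluator turns every flat output index back into per-dimension coordinates
// (extract_coordinates) before invoking the generator with them. The functor name below
// is illustrative; only the operator()(coords) signature matters.
#include <unsupported/Eigen/CXX11/Tensor>

struct CheckerboardGenerator {
  // Receives the (row, col) coordinates of the coefficient being produced.
  float operator()(const Eigen::array<Eigen::Index, 2>& coords) const {
    return static_cast<float>((coords[0] + coords[1]) % 2);
  }
};

void generator_usage() {
  Eigen::Tensor<float, 2> src(4, 4);
  src.setZero();
  // The values of `src` are ignored; only its shape drives the generated output.
  Eigen::Tensor<float, 2> board = src.generate(CheckerboardGenerator());
}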
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H -#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H - -namespace Eigen { - -/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors. - * - * This function computes the regularized incomplete beta function (integral). - * - */ -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const - TensorCwiseTernaryOp, - const ADerived, const BDerived, const XDerived> - betainc(const ADerived& a, const BDerived& b, const XDerived& x) { - return TensorCwiseTernaryOp< - internal::scalar_betainc_op, const ADerived, - const BDerived, const XDerived>( - a, b, x, internal::scalar_betainc_op()); -} - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h deleted file mode 100644 index a901c5d..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +++ /dev/null @@ -1,79 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H -#define EIGEN_CXX11_TENSOR_TENSOR_IO_H - -namespace Eigen { - -namespace internal { - -// Print the tensor as a 2d matrix -template -struct TensorPrinter { - static void run (std::ostream& os, const Tensor& tensor) { - typedef typename internal::remove_const::type Scalar; - typedef typename Tensor::Index Index; - const Index total_size = internal::array_prod(tensor.dimensions()); - if (total_size > 0) { - const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); - static const int layout = Tensor::Layout; - Map > matrix(const_cast(tensor.data()), first_dim, total_size/first_dim); - os << matrix; - } - } -}; - - -// Print the tensor as a vector -template -struct TensorPrinter { - static void run (std::ostream& os, const Tensor& tensor) { - typedef typename internal::remove_const::type Scalar; - typedef typename Tensor::Index Index; - const Index total_size = internal::array_prod(tensor.dimensions()); - if (total_size > 0) { - Map > array(const_cast(tensor.data()), total_size); - os << array; - } - } -}; - - -// Print the tensor as a scalar -template -struct TensorPrinter { - static void run (std::ostream& os, const Tensor& tensor) { - os << tensor.coeff(0); - } -}; -} - -template -std::ostream& operator << (std::ostream& os, const TensorBase& expr) { - typedef TensorEvaluator, DefaultDevice> Evaluator; - typedef typename Evaluator::Dimensions Dimensions; - - // Evaluate the expression if needed - TensorForcedEvalOp eval = expr.eval(); - Evaluator tensor(eval, DefaultDevice()); - tensor.evalSubExprsIfNeeded(NULL); - - // Print the result - static const int rank = internal::array_size::value; - internal::TensorPrinter::run(os, tensor); - - // Cleanup. 
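// A hedged usage sketch for the stream operator defined above: printing a tensor forces
// evaluation of the expression (via TensorForcedEvalOp) and then formats the result
// through a Map, so a rank-2 tensor is laid out as a matrix.
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

void tensor_io_usage() {
  Eigen::Tensor<int, 2> t(2, 3);
  t.setValues({{1, 2, 3}, {4, 5, 6}});
  std::cout << t << std::endl;        // printed as a 2x3 matrix by TensorPrinter
  std::cout << t * 2 << std::endl;    // expressions are evaluated before printing
}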
- tensor.cleanup(); - return os; -} - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h deleted file mode 100644 index 566856e..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ /dev/null @@ -1,509 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H -#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H - -namespace Eigen { - -/** \class TensorImagePatch - * \ingroup CXX11_Tensor_Module - * - * \brief Patch extraction specialized for image processing. - * This assumes that the input has a least 3 dimensions ordered as follow: - * 1st dimension: channels (of size d) - * 2nd dimension: rows (of size r) - * 3rd dimension: columns (of size c) - * There can be additional dimensions such as time (for video) or batch (for - * bulk processing after the first 3. - * Calling the image patch code with patch_rows and patch_cols is equivalent - * to calling the regular patch extraction code with parameters d, patch_rows, - * patch_cols, and 1 for all the additional dimensions. - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename internal::remove_const::type Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions + 1; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorImagePatchOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorImagePatchOp type; -}; - -} // end namespace internal - -template -class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, - DenseIndex row_strides, DenseIndex col_strides, - DenseIndex in_row_strides, DenseIndex in_col_strides, - DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, - PaddingType padding_type, Scalar padding_value) - : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_row_strides(row_strides), m_col_strides(col_strides), - m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), - m_padding_type(padding_type), m_padding_value(padding_value) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, 
DenseIndex patch_cols, - DenseIndex row_strides, DenseIndex col_strides, - DenseIndex in_row_strides, DenseIndex in_col_strides, - DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, - DenseIndex padding_top, DenseIndex padding_bottom, - DenseIndex padding_left, DenseIndex padding_right, - Scalar padding_value) - : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), - m_row_strides(row_strides), m_col_strides(col_strides), - m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), - m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), - m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom), - m_padding_left(padding_left), m_padding_right(padding_right), - m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} - - EIGEN_DEVICE_FUNC - DenseIndex patch_rows() const { return m_patch_rows; } - EIGEN_DEVICE_FUNC - DenseIndex patch_cols() const { return m_patch_cols; } - EIGEN_DEVICE_FUNC - DenseIndex row_strides() const { return m_row_strides; } - EIGEN_DEVICE_FUNC - DenseIndex col_strides() const { return m_col_strides; } - EIGEN_DEVICE_FUNC - DenseIndex in_row_strides() const { return m_in_row_strides; } - EIGEN_DEVICE_FUNC - DenseIndex in_col_strides() const { return m_in_col_strides; } - EIGEN_DEVICE_FUNC - DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } - EIGEN_DEVICE_FUNC - DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } - EIGEN_DEVICE_FUNC - bool padding_explicit() const { return m_padding_explicit; } - EIGEN_DEVICE_FUNC - DenseIndex padding_top() const { return m_padding_top; } - EIGEN_DEVICE_FUNC - DenseIndex padding_bottom() const { return m_padding_bottom; } - EIGEN_DEVICE_FUNC - DenseIndex padding_left() const { return m_padding_left; } - EIGEN_DEVICE_FUNC - DenseIndex padding_right() const { return m_padding_right; } - EIGEN_DEVICE_FUNC - PaddingType padding_type() const { return m_padding_type; } - EIGEN_DEVICE_FUNC - Scalar padding_value() const { return m_padding_value; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const DenseIndex m_patch_rows; - const DenseIndex m_patch_cols; - const DenseIndex m_row_strides; - const DenseIndex m_col_strides; - const DenseIndex m_in_row_strides; - const DenseIndex m_in_col_strides; - const DenseIndex m_row_inflate_strides; - const DenseIndex m_col_inflate_strides; - const bool m_padding_explicit; - const DenseIndex m_padding_top; - const DenseIndex m_padding_bottom; - const DenseIndex m_padding_left; - const DenseIndex m_padding_right; - const PaddingType m_padding_type; - const Scalar m_padding_value; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorImagePatchOp XprType; - typedef typename XprType::Index Index; - static const int NumInputDims = internal::array_size::Dimensions>::value; - static const int NumDims = NumInputDims + 1; - typedef DSizes Dimensions; - typedef typename internal::remove_const::type Scalar; - typedef TensorEvaluator, - Device> Self; - typedef TensorEvaluator Impl; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - 
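// A hedged usage sketch for the image-patch op above, assuming the usual
// TensorBase::extract_image_patches() helper that constructs TensorImagePatchOp: with
// ColMajor layout the input is (channels, rows, cols, batch) and the result gains one
// dimension, (channels, patch_rows, patch_cols, num_patches, batch), matching the
// bookkeeping done in the evaluator constructor that follows.
#include <unsupported/Eigen/CXX11/Tensor>

void image_patch_usage() {
  Eigen::Tensor<float, 4> input(3, 32, 32, 8);   // depth, rows, cols, batch (ColMajor)
  input.setRandom();

  // 5x5 patches, stride 1, default (SAME-style) padding handled by the overload.
  Eigen::Tensor<float, 5> patches = input.extract_image_patches(5, 5);
  // patches.dimension(3) == number of patch positions per image (32 * 32 here).
}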
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); - - m_paddingValue = op.padding_value(); - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - - // Caches a few variables. - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputDepth = input_dims[0]; - m_inputRows = input_dims[1]; - m_inputCols = input_dims[2]; - } else { - m_inputDepth = input_dims[NumInputDims-1]; - m_inputRows = input_dims[NumInputDims-2]; - m_inputCols = input_dims[NumInputDims-3]; - } - - m_row_strides = op.row_strides(); - m_col_strides = op.col_strides(); - - // Input strides and effective input/patch size - m_in_row_strides = op.in_row_strides(); - m_in_col_strides = op.in_col_strides(); - m_row_inflate_strides = op.row_inflate_strides(); - m_col_inflate_strides = op.col_inflate_strides(); - // The "effective" input rows and input cols are the input rows and cols - // after inflating them with zeros. - // For examples, a 2x3 matrix with row_inflate_strides and - // col_inflate_strides of 2 comes from: - // A B C - // D E F - // - // to a matrix is 3 x 5: - // - // A . B . C - // . . . . . - // D . E . F - - m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; - m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; - m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); - m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); - - if (op.padding_explicit()) { - m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); - m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); - m_rowPaddingTop = op.padding_top(); - m_colPaddingLeft = op.padding_left(); - } else { - // Computing padding from the type - switch (op.padding_type()) { - case PADDING_VALID: - m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); - m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); - // Calculate the padding - m_rowPaddingTop = numext::maxi(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2); - m_colPaddingLeft = numext::maxi(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2); - break; - case PADDING_SAME: - m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); - m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); - // Calculate the padding - m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; - m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; - break; - default: - eigen_assert(false && "unexpected padding"); - } - } - eigen_assert(m_outputRows > 0); - eigen_assert(m_outputCols > 0); - - // Dimensions for result of extraction. - if (static_cast(Layout) == static_cast(ColMajor)) { - // ColMajor - // 0: depth - // 1: patch_rows - // 2: patch_cols - // 3: number of patches - // 4 and beyond: anything else (such as batch). 
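A quick worked check of the formulas above: with input_rows_eff = 32, patch_rows_eff = 5 and a row stride of 1, PADDING_VALID yields ceil((32 - 5 + 1) / 1) = 28 output rows and a top padding of max(0, ((28 - 1) * 1 + 5 - 32) / 2) = 0, while PADDING_SAME yields ceil(32 / 1) = 32 output rows and a top padding of ((32 - 1) * 1 + 5 - 32) / 2 = 2.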
- m_dimensions[0] = input_dims[0]; - m_dimensions[1] = op.patch_rows(); - m_dimensions[2] = op.patch_cols(); - m_dimensions[3] = m_outputRows * m_outputCols; - for (int i = 4; i < NumDims; ++i) { - m_dimensions[i] = input_dims[i-1]; - } - } else { - // RowMajor - // NumDims-1: depth - // NumDims-2: patch_rows - // NumDims-3: patch_cols - // NumDims-4: number of patches - // NumDims-5 and beyond: anything else (such as batch). - m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; - m_dimensions[NumDims-2] = op.patch_rows(); - m_dimensions[NumDims-3] = op.patch_cols(); - m_dimensions[NumDims-4] = m_outputRows * m_outputCols; - for (int i = NumDims-5; i >= 0; --i) { - m_dimensions[i] = input_dims[i]; - } - } - - // Strides for moving the patch in various dimensions. - if (static_cast(Layout) == static_cast(ColMajor)) { - m_colStride = m_dimensions[1]; - m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; - m_otherStride = m_patchStride * m_dimensions[3]; - } else { - m_colStride = m_dimensions[NumDims-2]; - m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1]; - m_otherStride = m_patchStride * m_dimensions[NumDims-4]; - } - - // Strides for navigating through the input tensor. - m_rowInputStride = m_inputDepth; - m_colInputStride = m_inputDepth * m_inputRows; - m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols; - - // Fast representations of different variables. - m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); - m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); - m_fastColStride = internal::TensorIntDivisor(m_colStride); - m_fastInflateRowStride = internal::TensorIntDivisor(m_row_inflate_strides); - m_fastInflateColStride = internal::TensorIntDivisor(m_col_inflate_strides); - m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); - - // Number of patches in the width dimension. - m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); - if (static_cast(Layout) == static_cast(ColMajor)) { - m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); - } else { - m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - // Patch index corresponding to the passed in index. - const Index patchIndex = index / m_fastPatchStride; - // Find the offset of the element wrt the location of the first element. - const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; - - // Other ways to index this element. - const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride; - const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; - - // Calculate col index in the input original tensor. - const Index colIndex = patch2DIndex / m_fastOutputRows; - const Index colOffset = patchOffset / m_fastColStride; - const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; - const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? 
(inputCol / m_fastInflateColStride) : 0); - if (inputCol < 0 || inputCol >= m_input_cols_eff || - ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { - return Scalar(m_paddingValue); - } - - // Calculate row index in the original input tensor. - const Index rowIndex = patch2DIndex - colIndex * m_outputRows; - const Index rowOffset = patchOffset - colOffset * m_colStride; - const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; - const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0); - if (inputRow < 0 || inputRow >= m_input_rows_eff || - ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { - return Scalar(m_paddingValue); - } - - const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; - const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; - - const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride; - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { - return packetWithPossibleZero(index); - } - - const Index indices[2] = {index, index + PacketSize - 1}; - const Index patchIndex = indices[0] / m_fastPatchStride; - if (patchIndex != indices[1] / m_fastPatchStride) { - return packetWithPossibleZero(index); - } - const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; - eigen_assert(otherIndex == indices[1] / m_fastOtherStride); - - // Find the offset of the element wrt the location of the first element. - const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, - (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; - - const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; - eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); - - const Index colIndex = patch2DIndex / m_fastOutputRows; - const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; - - // Calculate col indices in the original input tensor. - const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - - m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; - if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { - return internal::pset1(Scalar(m_paddingValue)); - } - - if (inputCols[0] == inputCols[1]) { - const Index rowIndex = patch2DIndex - colIndex * m_outputRows; - const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; - eigen_assert(rowOffsets[0] <= rowOffsets[1]); - // Calculate col indices in the original input tensor. 
- const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - - m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; - - if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { - return internal::pset1(Scalar(m_paddingValue)); - } - - if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { - // no padding - const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; - const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; - const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; - return m_impl.template packet(inputIndex); - } - } - - return packetWithPossibleZero(index); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - const TensorEvaluator& impl() const { return m_impl; } - - Index rowPaddingTop() const { return m_rowPaddingTop; } - Index colPaddingLeft() const { return m_colPaddingLeft; } - Index outputRows() const { return m_outputRows; } - Index outputCols() const { return m_outputCols; } - Index userRowStride() const { return m_row_strides; } - Index userColStride() const { return m_col_strides; } - Index userInRowStride() const { return m_in_row_strides; } - Index userInColStride() const { return m_in_col_strides; } - Index rowInflateStride() const { return m_row_inflate_strides; } - Index colInflateStride() const { return m_col_inflate_strides; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost - costPerCoeff(bool vectorized) const { - // We conservatively estimate the cost for the code path where the computed - // index is inside the original image and - // TensorEvaluator::CoordAccess is false. - const double compute_cost = 3 * TensorOpCost::DivCost() + - 6 * TensorOpCost::MulCost() + - 8 * TensorOpCost::MulCost(); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const - { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - Dimensions m_dimensions; - - Index m_otherStride; - Index m_patchStride; - Index m_colStride; - Index m_row_strides; - Index m_col_strides; - - Index m_in_row_strides; - Index m_in_col_strides; - Index m_row_inflate_strides; - Index m_col_inflate_strides; - - Index m_input_rows_eff; - Index m_input_cols_eff; - Index m_patch_rows_eff; - Index m_patch_cols_eff; - - internal::TensorIntDivisor m_fastOtherStride; - internal::TensorIntDivisor m_fastPatchStride; - internal::TensorIntDivisor m_fastColStride; - internal::TensorIntDivisor m_fastInflateRowStride; - internal::TensorIntDivisor m_fastInflateColStride; - internal::TensorIntDivisor m_fastInputColsEff; - - Index m_rowInputStride; - Index m_colInputStride; - Index m_patchInputStride; - - Index m_inputDepth; - Index m_inputRows; - Index m_inputCols; - - Index m_outputRows; - Index m_outputCols; - - Index m_rowPaddingTop; - Index m_colPaddingLeft; - - internal::TensorIntDivisor m_fastOutputRows; - internal::TensorIntDivisor m_fastOutputDepth; - - Scalar m_paddingValue; - - TensorEvaluator m_impl; -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h 
b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h deleted file mode 100644 index 3209fec..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ /dev/null @@ -1,725 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H -#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H - - -#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES - -#define EIGEN_HAS_INDEX_LIST - -namespace Eigen { - -/** \internal - * - * \class TensorIndexList - * \ingroup CXX11_Tensor_Module - * - * \brief Set of classes used to encode a set of Tensor dimensions/indices. - * - * The indices in the list can be known at compile time or at runtime. A mix - * of static and dynamic indices can also be provided if needed. The tensor - * code will attempt to take advantage of the indices that are known at - * compile time to optimize the code it generates. - * - * This functionality requires a c++11 compliant compiler. If your compiler - * is older you need to use arrays of indices instead. - * - * Several examples are provided in the cxx11_tensor_index_list.cpp file. - * - * \sa Tensor - */ - -template -struct type2index { - static const DenseIndex value = n; - EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; } - EIGEN_DEVICE_FUNC void set(DenseIndex val) { - eigen_assert(val == n); - } -}; - -// This can be used with IndexPairList to get compile-time constant pairs, -// such as IndexPairList, type2indexpair<3,4>>(). 
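A minimal sketch of how these pieces are typically combined (the reduction mirrors the cxx11_tensor_index_list.cpp examples the comment above points to):

    // assumes #include <unsupported/Eigen/CXX11/Tensor> and a C++11 compiler
    Eigen::IndexList<Eigen::type2index<0>, int> dims;   // first index fixed at compile time
    dims.set(1, 2);                                     // second index chosen at run time
    Eigen::Tensor<float, 3> t(4, 5, 6);
    t.setRandom();
    // Sum over dimensions {0, 2}: the static 0 lets the reduction code
    // specialize, the dynamic 2 is read through IndexList::operator[].
    Eigen::Tensor<float, 1> s = t.sum(dims);            // s has size 5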
-template -struct type2indexpair { - static const DenseIndex first = f; - static const DenseIndex second = s; - - constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { - return IndexPair(f, s); - } - - EIGEN_DEVICE_FUNC void set(const IndexPair& val) { - eigen_assert(val.first == f); - eigen_assert(val.second == s); - } -}; - - -template struct NumTraits > -{ - typedef DenseIndex Real; - enum { - IsComplex = 0, - RequireInitialization = false, - ReadCost = 1, - AddCost = 1, - MulCost = 1 - }; - - EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; } - EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; } - EIGEN_DEVICE_FUNC static inline Real highest() { return n; } - EIGEN_DEVICE_FUNC static inline Real lowest() { return n; } -}; - -namespace internal { -template -EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) { - val = new_val; -} -template -EIGEN_DEVICE_FUNC void update_value(type2index& val, DenseIndex new_val) { - val.set(new_val); -} - -template -EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { - val = new_val; -} -template -EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { - val.set(new_val); -} - - -template -struct is_compile_time_constant { - static constexpr bool value = false; -}; - -template -struct is_compile_time_constant > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant& > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant& > { - static constexpr bool value = true; -}; - -template -struct is_compile_time_constant > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant& > { - static constexpr bool value = true; -}; -template -struct is_compile_time_constant& > { - static constexpr bool value = true; -}; - - -template -struct IndexTuple; - -template -struct IndexTuple { - EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { } - EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) 
{ } - - constexpr static int count = 1 + sizeof...(O); - T head; - IndexTuple others; - typedef T Head; - typedef IndexTuple Other; -}; - -template - struct IndexTuple { - EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { } - EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { } - - constexpr static int count = 1; - T head; - typedef T Head; -}; - - -template -struct IndexTupleExtractor; - -template -struct IndexTupleExtractor { - - typedef typename IndexTupleExtractor::ValType ValType; - - EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { - return IndexTupleExtractor::get_val(val.others); - } - - EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { - return IndexTupleExtractor::get_val(val.others); - } - template - EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { - IndexTupleExtractor::set_val(val.others, new_val); - } - -}; - -template - struct IndexTupleExtractor<0, T, O...> { - - typedef T ValType; - - EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { - return val.head; - } - EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { - return val.head; - } - template - EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { - val.head = new_val; - } -}; - - - -template -EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor::ValType& array_get(IndexTuple& tuple) { - return IndexTupleExtractor::get_val(tuple); -} -template -EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor::ValType& array_get(const IndexTuple& tuple) { - return IndexTupleExtractor::get_val(tuple); -} -template - struct array_size > { - static const size_t value = IndexTuple::count; -}; -template - struct array_size > { - static const size_t value = IndexTuple::count; -}; - - - - -template -struct tuple_coeff { - template - EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple& t) { - // return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); - return (i == Idx ? 
array_get(t) : tuple_coeff::get(i, t)); - } - template - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT& value) { - if (i == Idx) { - update_value(array_get(t), value); - } else { - tuple_coeff::set(i, t, value); - } - } - - template - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple& t) { - return ((i == Idx) & is_compile_time_constant::ValType>::value) || - tuple_coeff::value_known_statically(i, t); - } - - template - EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple& t) { - return is_compile_time_constant::ValType>::value && - tuple_coeff::values_up_to_known_statically(t); - } - - template - EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple& t) { - return is_compile_time_constant::ValType>::value && - is_compile_time_constant::ValType>::value && - array_get(t) > array_get(t) && - tuple_coeff::values_up_to_statically_known_to_increase(t); - } -}; - -template -struct tuple_coeff<0, ValueT> { - template - EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple& t) { - // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr - return array_get<0>(t)/* * (i == 0)*/; - } - template - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT value) { - eigen_assert (i == 0); - update_value(array_get<0>(t), value); - } - template - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple&) { - return is_compile_time_constant::ValType>::value & (i == 0); - } - - template - EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple&) { - return is_compile_time_constant::ValType>::value; - } - - template - EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple&) { - return true; - } -}; -} // namespace internal - - - -template -struct IndexList : internal::IndexTuple { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); - } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); - } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { - return internal::tuple_coeff >::value-1, DenseIndex>::set(i, *this, value); - } - - EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } - EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple(first, other...) { } - EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() { } - - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); - } - EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { - return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_known_statically(*this); - } - - EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { - return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this); - } -}; - - -template -constexpr IndexList make_index_list(FirstType val1, OtherTypes... 
other_vals) { - return IndexList(val1, other_vals...); -} - - -template -struct IndexPairList : internal::IndexTuple { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[] (const DenseIndex i) const { - return internal::tuple_coeff >::value-1, IndexPair>::get(i, *this); - } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair value) { - return internal::tuple_coeff>::value-1, IndexPair >::set(i, *this, value); - } - - EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } - EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple() { } - - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { - return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); - } -}; - -namespace internal { - -template size_t array_prod(const IndexList& sizes) { - size_t result = 1; - for (int i = 0; i < array_size >::value; ++i) { - result *= sizes[i]; - } - return result; -} - -template struct array_size > { - static const size_t value = array_size >::value; -}; -template struct array_size > { - static const size_t value = array_size >::value; -}; - -template struct array_size > { - static const size_t value = std::tuple_size >::value; -}; -template struct array_size > { - static const size_t value = std::tuple_size >::value; -}; - -template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList& a) { - return IndexTupleExtractor::get_val(a); -} -template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList& a) { - return IndexTupleExtractor::get_val(a); -} - -template -struct index_known_statically_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { - return false; - } -}; - -template -struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { - return IndexList().value_known_statically(i); - } -}; - -template -struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { - return IndexList().value_known_statically(i); - } -}; - - -template -struct all_indices_known_statically_impl { - static constexpr bool run() { - return false; - } -}; - -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return IndexList().all_values_known_statically(); - } -}; - -template -struct all_indices_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return IndexList().all_values_known_statically(); - } -}; - - -template -struct indices_statically_known_to_increase_impl { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return false; - } -}; - -template - struct indices_statically_known_to_increase_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return Eigen::IndexList().values_statically_known_to_increase(); - } -}; - -template - struct indices_statically_known_to_increase_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run() { - return Eigen::IndexList().values_statically_known_to_increase(); - } -}; - - -template -struct index_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) == value); - } -}; - -template -struct index_statically_eq_impl > { - 
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) == value); - } -}; - - -template -struct index_statically_ne_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) != value); - } -}; - -template -struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) != value); - } -}; - - -template -struct index_statically_gt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) > value); - } -}; - -template -struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) > value); - } -}; - - - -template -struct index_statically_lt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) < value); - } -}; - -template -struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexList().value_known_statically(i) & - (IndexList().get(i) < value); - } -}; - - - -template -struct index_pair_first_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_pair_first_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexPairList().value_known_statically(i) & - (IndexPairList().operator[](i).first == value); - } -}; - -template -struct index_pair_first_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexPairList().value_known_statically(i) & - (IndexPairList().operator[](i).first == value); - } -}; - - - -template -struct index_pair_second_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_pair_second_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexPairList().value_known_statically(i) & - (IndexPairList().operator[](i).second == value); - } -}; - -template -struct index_pair_second_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { - return IndexPairList().value_known_statically(i) & - (IndexPairList().operator[](i).second == value); - } -}; - - -} // end namespace internal -} // end namespace Eigen - -#else - -namespace Eigen { -namespace internal { - -template -struct index_known_statically_impl { - static 
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { - return false; - } -}; - -template -struct all_indices_known_statically_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return false; - } -}; - -template -struct indices_statically_known_to_increase_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { - return false; - } -}; - -template -struct index_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_ne_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_gt_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_statically_lt_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_pair_first_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - -template -struct index_pair_second_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { - return false; - } -}; - - - -} // end namespace internal -} // end namespace Eigen - -#endif - - -namespace Eigen { -namespace internal { -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) { - return index_known_statically_impl::run(i); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() { - return all_indices_known_statically_impl::run(); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() { - return indices_statically_known_to_increase_impl::run(); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) { - return index_statically_eq_impl::run(i, value); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) { - return index_statically_ne_impl::run(i, value); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) { - return index_statically_gt_impl::run(i, value); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) { - return index_statically_lt_impl::run(i, value); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) { - return index_pair_first_statically_eq_impl::run(i, value); -} - -template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) { - return index_pair_second_statically_eq_impl::run(i, value); -} - -} // end namespace internal -} // end namespace Eigen - - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h deleted file mode 100644 index f391fb9..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ /dev/null @@ -1,229 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Ke Yang -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H -#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H - -namespace Eigen { - -/** \class TensorInflation - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor inflation class. - * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorInflationOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorInflationOp type; -}; - -} // end namespace internal - -template -class TensorInflationOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides) - : m_xpr(expr), m_strides(strides) {} - - EIGEN_DEVICE_FUNC - const Strides& strides() const { return m_strides; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const Strides m_strides; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorInflationOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = /*TensorEvaluator::IsAligned*/ false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_strides(op.strides()) - { - m_dimensions = m_impl.dimensions(); - // Expand each dimension to the inflated dimension. - for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1; - } - - // Remember the strides for fast division. 
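To make the dimension expansion computed a few lines above concrete, here is a minimal sketch using the user-facing inflate() expression (the spelling of that call is assumed from TensorBase, not taken from this file):

    // assumes #include <unsupported/Eigen/CXX11/Tensor>
    Eigen::Tensor<int, 2> m(2, 3);
    m.setValues({{1, 2, 3}, {4, 5, 6}});
    Eigen::array<Eigen::DenseIndex, 2> strides{{2, 2}};
    // Each dimension becomes (d - 1) * stride + 1, so the 2x3 input inflates
    // to 3x5, with zeros filling the newly created holes between elements.
    Eigen::Tensor<int, 2> inflated = m.inflate(strides);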
- for (int i = 0; i < NumDims; ++i) { - m_fastStrides[i] = internal::TensorIntDivisor(m_strides[i]); - } - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - if (static_cast(Layout) == static_cast(ColMajor)) { - m_outputStrides[0] = 1; - m_inputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - } - } else { // RowMajor - m_outputStrides[NumDims-1] = 1; - m_inputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - // Computes the input index given the output index. Returns true if the output - // index doesn't fall into a hole. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const - { - eigen_assert(index < dimensions().TotalSize()); - *inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (idx != idx / m_fastStrides[i] * m_strides[i]) { - return false; - } - *inputIndex += idx / m_strides[i] * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (index != index / m_fastStrides[0] * m_strides[0]) { - return false; - } - *inputIndex += index / m_strides[0]; - return true; - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i]; - if (idx != idx / m_fastStrides[i] * m_strides[i]) { - return false; - } - *inputIndex += idx / m_strides[i] * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) { - return false; - } - *inputIndex += index / m_strides[NumDims - 1]; - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - Index inputIndex = 0; - if (getInputIndex(index, &inputIndex)) { - return m_impl.coeff(inputIndex); - } else { - return Scalar(0); - } - } - - // TODO(yangke): optimize this function so that we can detect and produce - // all-zero packets - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (3 * TensorOpCost::DivCost() + - 3 * TensorOpCost::MulCost() + - 2 * TensorOpCost::AddCost()); - const double input_size = m_impl.dimensions().TotalSize(); - const double output_size = m_dimensions.TotalSize(); - if (output_size == 0) - return TensorOpCost(); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, - compute_cost, vectorized, 
PacketSize); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - Dimensions m_dimensions; - array m_outputStrides; - array m_inputStrides; - TensorEvaluator m_impl; - const Strides m_strides; - array, NumDims> m_fastStrides; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h deleted file mode 100644 index 33edc49..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +++ /dev/null @@ -1,82 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H -#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H - -#if EIGEN_HAS_VARIADIC_TEMPLATES - -#include - -namespace Eigen { - -/** \class TensorInitializer - * \ingroup CXX11_Tensor_Module - * - * \brief Helper template to initialize Tensors from std::initializer_lists. - */ -namespace internal { - -template -struct Initializer { - typedef std::initializer_list< - typename Initializer::InitList> InitList; - - static void run(TensorEvaluator& tensor, - Eigen::array::Index, traits::NumDimensions>* indices, - const InitList& vals) { - int i = 0; - for (auto v : vals) { - (*indices)[traits::NumDimensions - N] = i++; - Initializer::run(tensor, indices, v); - } - } -}; - -template -struct Initializer { - typedef std::initializer_list::Scalar> InitList; - - static void run(TensorEvaluator& tensor, - Eigen::array::Index, traits::NumDimensions>* indices, - const InitList& vals) { - int i = 0; - // There is likely a faster way to do that than iterating. - for (auto v : vals) { - (*indices)[traits::NumDimensions - 1] = i++; - tensor.coeffRef(*indices) = v; - } - } -}; - -template -struct Initializer { - typedef typename traits::Scalar InitList; - - static void run(TensorEvaluator& tensor, - Eigen::array::Index, traits::NumDimensions>*, - const InitList& v) { - tensor.coeffRef(0) = v; - } -}; - - -template -void initialize_tensor(TensorEvaluator& tensor, - const typename Initializer::NumDimensions>::InitList& vals) { - Eigen::array::Index, traits::NumDimensions> indices; - Initializer::NumDimensions>::run(tensor, &indices, vals); -} - -} // namespace internal -} // namespace Eigen - -#endif // EIGEN_HAS_VARIADIC_TEMPLATES - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h deleted file mode 100644 index ede3939..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ /dev/null @@ -1,253 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H -#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H - - -namespace Eigen { - -/** \internal - * - * \class TensorIntDiv - * \ingroup CXX11_Tensor_Module - * - * \brief Fast integer division by a constant. 
- * - * See the paper from Granlund and Montgomery for explanation. - * (at http://dx.doi.org/10.1145/773473.178249) - * - * \sa Tensor - */ - -namespace internal { - -namespace { - - // Note: result is undefined if val == 0 - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename internal::enable_if::type count_leading_zeros(const T val) - { -#ifdef __CUDA_ARCH__ - return __clz(val); -#elif EIGEN_COMP_MSVC - unsigned long index; - _BitScanReverse(&index, val); - return 31 - index; -#else - EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); - return __builtin_clz(static_cast(val)); -#endif - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename internal::enable_if::type count_leading_zeros(const T val) - { -#ifdef __CUDA_ARCH__ - return __clzll(val); -#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 - unsigned long index; - _BitScanReverse64(&index, val); - return 63 - index; -#elif EIGEN_COMP_MSVC - // MSVC's _BitScanReverse64 is not available for 32bits builds. - unsigned int lo = (unsigned int)(val&0xffffffff); - unsigned int hi = (unsigned int)((val>>32)&0xffffffff); - int n; - if(hi==0) - n = 32 + count_leading_zeros(lo); - else - n = count_leading_zeros(hi); - return n; -#else - EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); - return __builtin_clzll(static_cast(val)); -#endif - } - - template - struct UnsignedTraits { - typedef typename conditional::type type; - }; - - template - struct DividerTraits { - typedef typename UnsignedTraits::type type; - static const int N = sizeof(T) * 8; - }; - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { -#if defined(__CUDA_ARCH__) - return __umulhi(a, b); -#else - return (static_cast(a) * b) >> 32; -#endif - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { -#if defined(__CUDA_ARCH__) - return __umul64hi(a, b); -#elif defined(__SIZEOF_INT128__) - __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); - return static_cast(v >> 64); -#else - return (TensorUInt128, uint64_t>(a) * TensorUInt128, uint64_t>(b)).upper(); -#endif - } - - template - struct DividerHelper { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) { - EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); - return static_cast((static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1); - } - }; - - template - struct DividerHelper<64, T> { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) - return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); -#else - const uint64_t shift = 1ULL << log_div; - TensorUInt128 result = TensorUInt128 >(shift, 0) / TensorUInt128, uint64_t>(divider) - - TensorUInt128, static_val<0> >(1, 0) - + TensorUInt128, static_val<1> >(1); - return static_cast(result); -#endif - } - }; -} - - -template -struct TensorIntDivisor { - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { - multiplier = 0; - shift1 = 0; - shift2 = 0; - } - - // Must have 0 < divider < 2^31. This is relaxed to - // 0 < divider < 2^63 when using 64-bit indices on platforms that support - // the __uint128_t type. 
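Because TensorIntDivisor is what the evaluators above store as their m_fast* members, a short usage sketch may help; this is an internal helper, so the snippet is illustrative only:

    // assumes #include <unsupported/Eigen/CXX11/Tensor>
    // Precompute the multiply-and-shift form of "divide by 7" once...
    const Eigen::internal::TensorIntDivisor<int> div7(7);
    // ...then reuse it in hot loops instead of a hardware integer division.
    int q1 = div7.divide(123);   // 17, same as 123 / 7
    int q2 = 123 / div7;         // via the operator/ overload at the end of this file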
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { - const int N = DividerTraits::N; - eigen_assert(static_cast::type>(divider) < NumTraits::highest()/2); - eigen_assert(divider > 0); - - // fast ln2 - const int leading_zeros = count_leading_zeros(static_cast(divider)); - int log_div = N - leading_zeros; - // if divider is a power of two then log_div is 1 more than it should be. - if ((static_cast::type>(1) << (log_div-1)) == static_cast::type>(divider)) - log_div--; - - multiplier = DividerHelper::computeMultiplier(log_div, divider); - shift1 = log_div > 1 ? 1 : log_div; - shift2 = log_div > 1 ? log_div-1 : 0; - } - - // Must have 0 <= numerator. On platforms that dont support the __uint128_t - // type numerator should also be less than 2^32-1. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { - eigen_assert(static_cast::type>(numerator) < NumTraits::highest()/2); - //eigen_assert(numerator >= 0); // this is implicitly asserted by the line above - - UnsignedType t1 = muluh(multiplier, numerator); - UnsignedType t = (static_cast(numerator) - t1) >> shift1; - return (t1 + t) >> shift2; - } - - private: - typedef typename DividerTraits::type UnsignedType; - UnsignedType multiplier; - int32_t shift1; - int32_t shift2; -}; - - -// Optimized version for signed 32 bit integers. -// Derived from Hacker's Delight. -// Only works for divisors strictly greater than one -template <> -class TensorIntDivisor { - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { - magic = 0; - shift = 0; - } - // Must have 2 <= divider - EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) { - eigen_assert(divider >= 2); - calcMagic(divider); - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { -#ifdef __CUDA_ARCH__ - return (__umulhi(magic, n) >> shift); -#else - uint64_t v = static_cast(magic) * static_cast(n); - return (static_cast(v >> 32) >> shift); -#endif - } - -private: - // Compute the magic numbers. See Hacker's Delight section 10 for an in - // depth explanation. - EIGEN_DEVICE_FUNC void calcMagic(int32_t d) { - const unsigned two31 = 0x80000000; // 2**31. - unsigned ad = d; - unsigned t = two31 + (ad >> 31); - unsigned anc = t - 1 - t%ad; // Absolute value of nc. - int p = 31; // Init. p. - unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|. - unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|). - unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|. - unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|). - unsigned delta = 0; - do { - p = p + 1; - q1 = 2*q1; // Update q1 = 2**p/|nc|. - r1 = 2*r1; // Update r1 = rem(2**p, |nc|). - if (r1 >= anc) { // (Must be an unsigned - q1 = q1 + 1; // comparison here). - r1 = r1 - anc;} - q2 = 2*q2; // Update q2 = 2**p/|d|. - r2 = 2*r2; // Update r2 = rem(2**p, |d|). - if (r2 >= ad) { // (Must be an unsigned - q2 = q2 + 1; // comparison here). 
- r2 = r2 - ad;} - delta = ad - r2; - } while (q1 < delta || (q1 == delta && r1 == 0)); - - magic = (unsigned)(q2 + 1); - shift = p - 32; - } - - uint32_t magic; - int32_t shift; -}; - - -template -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { - return divisor.divide(numerator); -} - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h deleted file mode 100644 index cd0109e..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ /dev/null @@ -1,209 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H -#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H - -namespace Eigen { - -/** \class TensorLayoutSwap - * \ingroup CXX11_Tensor_Module - * - * \brief Swap the layout from col-major to row-major, or row-major - * to col-major, and invert the order of the dimensions. - * - * Beware: the dimensions are reversed by this operation. If you want to - * preserve the ordering of the dimensions, you need to combine this - * operation with a shuffle. - * - * \example: - * Tensor input(2, 4); - * Tensor output = input.swap_layout(); - * eigen_assert(output.dimension(0) == 4); - * eigen_assert(output.dimension(1) == 2); - * - * array shuffle(1, 0); - * output = input.swap_layout().shuffle(shuffle); - * eigen_assert(output.dimension(0) == 2); - * eigen_assert(output.dimension(1) == 4); - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = traits::NumDimensions; - static const int Layout = (traits::Layout == ColMajor) ? 
RowMajor : ColMajor; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorLayoutSwapOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorLayoutSwapOp type; -}; - -} // end namespace internal - - - -template -class TensorLayoutSwapOp : public TensorBase, WriteAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) - : m_xpr(expr) {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - protected: - typename XprType::Nested m_xpr; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorLayoutSwapOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value; - typedef DSizes Dimensions; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? 
RowMajor : ColMajor, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - for(int i = 0; i < NumDims; ++i) { - m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; - } - } - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - return m_impl.evalSubExprsIfNeeded(data); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(index); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_impl.template packet(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } - - const TensorEvaluator& impl() const { return m_impl; } - - protected: - TensorEvaluator m_impl; - Dimensions m_dimensions; -}; - - -// Eval as lvalue -template - struct TensorEvaluator, Device> - : public TensorEvaluator, Device> -{ - typedef TensorEvaluator, Device> Base; - typedef TensorLayoutSwapOp XprType; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, - CoordAccess = false // to be implemented - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(index); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - this->m_impl.template writePacket(index, x); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h deleted file mode 100644 index ee0078b..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ /dev/null @@ -1,54 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H -#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H - - -/** use this macro in sfinae selection in templated functions - * - * template::value , int >::type = 0 - * > - * void foo(){} - * - * becomes => - * - * template::value ) - * > - * void foo(){} - */ - -// SFINAE requires variadic templates -#ifndef __CUDACC__ -#if EIGEN_HAS_VARIADIC_TEMPLATES - // SFINAE doesn't work for gcc <= 4.7 - #ifdef EIGEN_COMP_GNUC - #if EIGEN_GNUC_AT_LEAST(4,8) - #define EIGEN_HAS_SFINAE - #endif - #else - #define EIGEN_HAS_SFINAE - #endif -#endif -#endif - -#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \ - typename internal::enable_if< ( __condition__ ) , int >::type = 0 - - -#if EIGEN_HAS_CONSTEXPR -#define EIGEN_CONSTEXPR constexpr -#else -#define EIGEN_CONSTEXPR -#endif - - -#endif diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h deleted file mode 100644 index e4fc86a..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ /dev/null @@ -1,323 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H -#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H - -namespace Eigen { - -// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) - -/** \class TensorMap - * \ingroup CXX11_Tensor_Module - * - * \brief A tensor expression mapping an existing array of data. - * - */ -/// `template class MakePointer_` is added to convert the host pointer to the device pointer. -/// It is added due to the fact that for our device compiler `T*` is not allowed. -/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. -/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . -/// Therefore, by adding the default value, we managed to convert the type and it does not break any -/// existing code as its default value is `T*`. -template class MakePointer_> class TensorMap : public TensorBase > -{ - public: - typedef TensorMap Self; - typedef typename PlainObjectType::Base Base; - typedef typename Eigen::internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - typedef typename internal::traits::Scalar Scalar; - typedef typename NumTraits::Real RealScalar; - typedef typename Base::CoeffReturnType CoeffReturnType; - - /* typedef typename internal::conditional< - bool(internal::is_lvalue::value), - Scalar *, - const Scalar *>::type - PointerType;*/ - typedef typename MakePointer_::Type PointerType; - typedef PointerType PointerArgType; - - static const int Options = Options_; - - static const Index NumIndices = PlainObjectType::NumIndices; - typedef typename PlainObjectType::Dimensions Dimensions; - - enum { - IsAligned = ((int(Options_)&Aligned)==Aligned), - Layout = PlainObjectType::Layout, - CoordAccess = true, - RawAccess = true - }; - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
- EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { - // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. - EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { - EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { - EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { - EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { - EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array& dimensions) - : m_data(dataPtr), m_dimensions(dimensions) - { } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) - : m_data(dataPtr), m_dimensions(dimensions) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) - : m_data(tensor.data()), m_dimensions(tensor.dimensions()) - { } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PointerType data() { return m_data; } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const PointerType data() const { return m_data; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const - { - // eigen_assert(checkIndexRange(indices)); - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(indices); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(indices); - return m_data[index]; - } - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()() const - { - 
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) - return m_data[0]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const - { - eigen_internal_assert(index >= 0 && index < size()); - return m_data[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const - { - EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i1 + i0 * m_dimensions[1]; - return m_data[index]; - } else { - const Index index = i0 + i1 * m_dimensions[0]; - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); - return m_data[index]; - } - } -#endif - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) - { - // eigen_assert(checkIndexRange(indices)); - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(indices); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(indices); - return m_data[index]; - } - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()() - { - EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) - return m_data[0]; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index index) - { - eigen_internal_assert(index >= 0 && index < size()); - return m_data[index]; - } - -#if EIGEN_HAS_VARIADIC_TEMPLATES - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) - { - static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - const std::size_t NumDims = sizeof...(otherIndices) + 2; - if (PlainObjectType::Options&RowMajor) { - const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } else { - const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); - return m_data[index]; - } - } -#else - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i1 + i0 * m_dimensions[1]; - return m_data[index]; - } else { - const Index index = i0 + i1 * m_dimensions[0]; - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); - return m_data[index]; - } - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) - { - if (PlainObjectType::Options&RowMajor) { - const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); - return m_data[index]; - } else { - const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); - return m_data[index]; - } - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Self& operator=(const Self& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Self& operator=(const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - private: - typename MakePointer_::Type m_data; - Dimensions m_dimensions; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h deleted file mode 100644 index 615559d..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ /dev/null @@ -1,218 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2015 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
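// For reference, a minimal sketch of how the TensorMap expression removed
// above is typically used (hypothetical example; the buffer name and sizes
// are made up, and only the public Tensor API is assumed):
#include <unsupported/Eigen/CXX11/Tensor>

void tensor_map_sketch() {
  float storage[2 * 3 * 4] = {0};
  // TensorMap views the existing buffer as a rank-3 tensor of size 2x3x4;
  // it never owns or reallocates the memory it points to.
  Eigen::TensorMap<Eigen::Tensor<float, 3>> view(storage, 2, 3, 4);
  view(1, 2, 3) = 42.0f;                // writes through to storage[]
  Eigen::Tensor<float, 3> copy = view;  // materializes an owning copy
}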
- -#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H -#define EIGEN_CXX11_TENSOR_TENSOR_META_H - -namespace Eigen { - -template struct Cond {}; - -template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -const T1& choose(Cond, const T1& first, const T2&) { - return first; -} - -template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -const T2& choose(Cond, const T1&, const T2& second) { - return second; -} - - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T divup(const X x, const Y y) { - return static_cast((x + y - 1) / y); -} - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T divup(const T x, const T y) { - return static_cast((x + y - 1) / y); -} - -template struct max_n_1 { - static const size_t size = n; -}; -template <> struct max_n_1<0> { - static const size_t size = 1; -}; - - -// Default packet types -template -struct PacketType : internal::packet_traits { - typedef typename internal::packet_traits::type type; -}; - -// For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) -template <> -struct PacketType { - typedef half2 type; - static const int size = 2; - enum { - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 0, - HasMin = 1, - HasMax = 1, - HasConj = 0, - HasSetLinear = 0, - HasBlend = 0, - - HasDiv = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasExp = 1, - HasLog = 1, - HasLog1p = 0, - HasLog10 = 0, - HasPow = 1, - }; -}; -#endif - -#if defined(EIGEN_USE_SYCL) -template - struct PacketType { - typedef T type; - static const int size = 1; - enum { - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasArg = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasBlend = 0 - }; -}; -#endif - - -// Tuple mimics std::pair but works on e.g. nvcc. -template struct Tuple { - public: - U first; - V second; - - typedef U first_type; - typedef V second_type; - - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Tuple() : first(), second() {} - - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Tuple(const U& f, const V& s) : first(f), second(s) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Tuple& operator= (const Tuple& rhs) { - if (&rhs == this) return *this; - first = rhs.first; - second = rhs.second; - return *this; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void swap(Tuple& rhs) { - using numext::swap; - swap(first, rhs.first); - swap(second, rhs.second); - } -}; - -template -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -bool operator==(const Tuple& x, const Tuple& y) { - return (x.first == y.first && x.second == y.second); -} - -template -EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -bool operator!=(const Tuple& x, const Tuple& y) { - return !(x == y); -} - - -// Can't use std::pairs on cuda devices -template struct IndexPair { - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} - - EIGEN_DEVICE_FUNC void set(IndexPair val) { - first = val.first; - second = val.second; - } - - Idx first; - Idx second; -}; - - -#ifdef EIGEN_HAS_SFINAE -namespace internal { - - template - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array customIndices2Array(IndexType& idx, numeric_list) { - return { idx[Is]... 
}; - } - template - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array customIndices2Array(IndexType&, numeric_list) { - return array(); - } - - /** Make an array (for index/dimensions) out of a custom index */ - template - EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array customIndices2Array(IndexType& idx) { - return customIndices2Array(idx, typename gen_numeric_list::type{}); - } - - - template - struct is_base_of - { - - typedef char (&yes)[1]; - typedef char (&no)[2]; - - template - struct Host - { - operator BB*() const; - operator DD*(); - }; - - template - static yes check(D*, T); - static no check(B*, int); - - static const bool value = sizeof(check(Host(), int())) == sizeof(yes); - }; - -} -#endif - - - -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h deleted file mode 100644 index d34f1e3..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ /dev/null @@ -1,888 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H -#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H - -namespace Eigen { - -/** \class TensorReshaping - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reshaping class. - * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = array_size::value; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorReshapingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorReshapingOp type; -}; - -} // end namespace internal - - - -template -class TensorReshapingOp : public TensorBase, WriteAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) - : m_xpr(expr), m_dims(dims) {} - - EIGEN_DEVICE_FUNC - const NewDimensions& dimensions() const { return m_dims; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const TensorReshapingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorReshapingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - 
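  // Usage sketch for the reshaping and slicing expressions declared in this
  // header (hypothetical example, assuming only the public Tensor API):
  //
  //   Eigen::Tensor<float, 3> input(2, 3, 4);
  //   input.setRandom();
  //
  //   // reshape() reinterprets the 24 coefficients as a 6x4 tensor; the
  //   // total number of coefficients must be preserved.
  //   Eigen::array<Eigen::Index, 2> new_dims{{6, 4}};
  //   Eigen::Tensor<float, 2> reshaped = input.reshape(new_dims);
  //
  //   // slice() extracts a 2x2x2 block starting at offsets (0, 1, 1).
  //   Eigen::array<Eigen::Index, 3> offsets{{0, 1, 1}};
  //   Eigen::array<Eigen::Index, 3> extents{{2, 2, 2}};
  //   Eigen::Tensor<float, 3> block = input.slice(offsets, extents);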
protected: - typename XprType::Nested m_xpr; - const NewDimensions m_dims; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorReshapingOp XprType; - typedef NewDimensions Dimensions; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_dimensions(op.dimensions()) - { - // The total size of the reshaped tensor must be equal to the total size - // of the input tensor. - eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - return m_impl.evalSubExprsIfNeeded(data); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(index); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - return m_impl.template packet(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } - - EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } - - protected: - TensorEvaluator m_impl; - NewDimensions m_dimensions; -}; - - -// Eval as lvalue -template - struct TensorEvaluator, Device> - : public TensorEvaluator, Device> - -{ - typedef TensorEvaluator, Device> Base; - typedef TensorReshapingOp XprType; - typedef NewDimensions Dimensions; - - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(index); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - this->m_impl.template writePacket(index, x); - } -}; - - -/** \class TensorSlicing - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor slicing class. 
- * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = array_size::value; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorSlicingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorSlicingOp type; -}; - -} // end namespace internal - - - -template -class TensorSlicingOp : public TensorBase > -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) - : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} - - EIGEN_DEVICE_FUNC - const StartIndices& startIndices() const { return m_indices; } - EIGEN_DEVICE_FUNC - const Sizes& sizes() const { return m_sizes; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorSlicingOp& operator = (const TensorSlicingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run(assign, DefaultDevice()); - return *this; - } - - - protected: - typename XprType::Nested m_xpr; - const StartIndices m_indices; - const Sizes m_sizes; -}; - - -// Fixme: figure out the exact threshold -namespace { -template struct MemcpyTriggerForSlicing { - EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } - EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; } - - private: - Index threshold_; -}; - -// It is very expensive to start the memcpy kernel on GPU: we therefore only -// use it for large copies. -#ifdef EIGEN_USE_GPU -template struct MemcpyTriggerForSlicing { - EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } -}; -#endif -} - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorSlicingOp XprType; - static const int NumDims = internal::array_size::value; - - enum { - // Alignment can't be guaranteed at compile time since it depends on the - // slice offsets and sizes. 
- IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) - { - for (std::size_t i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); - } - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - const Sizes& output_dims = op.sizes(); - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - } - - // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); - } - } else { - m_inputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - } - - // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed. - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); - } - } - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Sizes Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { - m_impl.evalSubExprsIfNeeded(NULL); - if (!NumTraits::type>::RequireInitialization && data && m_impl.data()) { - Index contiguous_values = 1; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims; ++i) { - contiguous_values *= dimensions()[i]; - if (dimensions()[i] != m_impl.dimensions()[i]) { - break; - } - } - } else { - for (int i = NumDims-1; i >= 0; --i) { - contiguous_values *= dimensions()[i]; - if (dimensions()[i] != m_impl.dimensions()[i]) { - break; - } - } - } - // Use memcpy if it's going to be faster than using the regular evaluation. 
- const MemcpyTriggerForSlicing trigger(m_device); - if (trigger(contiguous_values)) { - Scalar* src = (Scalar*)m_impl.data(); - for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { - Index offset = srcCoeff(i); - m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); - } - return false; - } - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(srcCoeff(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - const int packetSize = internal::unpacket_traits::size; - EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); - - Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + m_offsets[0]); - inputIndices[1] += (indices[1] + m_offsets[0]); - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / m_fastOutputStrides[i]; - const Index idx1 = indices[1] / m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; - inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; - indices[0] -= idx0 * m_outputStrides[i]; - indices[1] -= idx1 * m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); - inputIndices[1] += (indices[1] + m_offsets[NumDims-1]); - } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { - PacketReturnType rslt = m_impl.template packet(inputIndices[0]); - return rslt; - } - else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; - values[0] = m_impl.coeff(inputIndices[0]); - values[packetSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < packetSize-1; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); - } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { - Scalar* result = m_impl.data(); - if (result) { - Index offset = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims; ++i) { - if (m_dimensions[i] != m_impl.dimensions()[i]) { - offset += m_offsets[i] * m_inputStrides[i]; - for (int j = i+1; j < NumDims; ++j) { - if (m_dimensions[j] > 1) { - return NULL; - } - offset += m_offsets[j] * m_inputStrides[j]; - } - break; - } - } - } else { - for (int i = NumDims - 1; i >= 0; --i) { - if (m_dimensions[i] != m_impl.dimensions()[i]) { - offset += m_offsets[i] * m_inputStrides[i]; - for (int j = i-1; j >= 0; --j) { - if (m_dimensions[j] > 1) { - return NULL; - } - offset += m_offsets[j] * m_inputStrides[j]; - } - break; - } - } - } - return result + offset; - } - return NULL; - } - - protected: - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE Index srcCoeff(Index index) const - { - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[0]); - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - inputIndex += (index + m_offsets[NumDims-1]); - } - return inputIndex; - } - - array m_outputStrides; - array, NumDims> m_fastOutputStrides; - array m_inputStrides; - TensorEvaluator m_impl; - const Device& m_device; - Dimensions m_dimensions; - const StartIndices m_offsets; -}; - - -// Eval as lvalue -template -struct TensorEvaluator, Device> - : public TensorEvaluator, Device> -{ - typedef TensorEvaluator, Device> Base; - typedef TensorSlicingOp XprType; - static const int NumDims = internal::array_size::value; - - enum { - IsAligned = /*TensorEvaluator::IsAligned*/false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Sizes Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void writePacket(Index index, const PacketReturnType& x) - { - const int packetSize = internal::unpacket_traits::size; - Index inputIndices[] = {0, 0}; - Index indices[] = {index, index + packetSize - 1}; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; - const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; - inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; - indices[0] -= idx0 * this->m_outputStrides[i]; - indices[1] -= idx1 * this->m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + this->m_offsets[0]); - inputIndices[1] += (indices[1] + this->m_offsets[0]); - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; - const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; - inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; - inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; - indices[0] -= idx0 * this->m_outputStrides[i]; - indices[1] -= idx1 * this->m_outputStrides[i]; - } - inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); - inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); - } - if (inputIndices[1] - inputIndices[0] == packetSize - 1) { - this->m_impl.template writePacket(inputIndices[0], x); - } - else { - EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; - internal::pstore(values, x); - this->m_impl.coeffRef(inputIndices[0]) = values[0]; - this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; - for (int i = 1; 
i < packetSize-1; ++i) { - this->coeffRef(index+i) = values[i]; - } - } - } -}; - - - -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = array_size::value; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorStridingSlicingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorStridingSlicingOp type; -}; - -} // end namespace internal - - -template -class TensorStridingSlicingOp : public TensorBase > -{ - public: - typedef typename internal::traits::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename internal::nested::type Nested; - typedef typename internal::traits::StorageKind StorageKind; - typedef typename internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp( - const XprType& expr, const StartIndices& startIndices, - const StopIndices& stopIndices, const Strides& strides) - : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), - m_strides(strides) {} - - EIGEN_DEVICE_FUNC - const StartIndices& startIndices() const { return m_startIndices; } - EIGEN_DEVICE_FUNC - const StartIndices& stopIndices() const { return m_stopIndices; } - EIGEN_DEVICE_FUNC - const StartIndices& strides() const { return m_strides; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run( - assign, DefaultDevice()); - return *this; - } - - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other) - { - typedef TensorAssignOp Assign; - Assign assign(*this, other); - internal::TensorExecutor::run( - assign, DefaultDevice()); - return *this; - } - - protected: - typename XprType::Nested m_xpr; - const StartIndices m_startIndices; - const StopIndices m_stopIndices; - const Strides m_strides; -}; - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorStridingSlicingOp XprType; - static const int NumDims = internal::array_size::value; - - enum { - // Alignment can't be guaranteed at compile time since it depends on the - // slice offsets and sizes. 
- IsAligned = false, - PacketAccess = false, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) - { - // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero - DSizes startIndicesClamped, stopIndicesClamped; - for (size_t i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); - if(m_strides[i]>0){ - startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); - stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); - }else{ - /* implies m_strides[i]<0 by assert */ - startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); - stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); - } - m_startIndices[i] = startIndicesClamped[i]; - } - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - - // check for degenerate intervals and compute output tensor shape - bool degenerate = false;; - for(int i = 0; i < NumDims; i++){ - Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; - if(interval == 0 || ((interval<0) != (m_strides[i]<0))){ - m_dimensions[i] = 0; - degenerate = true; - }else{ - m_dimensions[i] = interval / m_strides[i] - + (interval % m_strides[i] != 0 ? 1 : 0); - eigen_assert(m_dimensions[i] >= 0); - } - } - Strides output_dims = m_dimensions; - - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStrides[0] = m_strides[0]; - m_offsets[0] = startIndicesClamped[0]; - Index previousDimProduct = 1; - for (int i = 1; i < NumDims; ++i) { - previousDimProduct *= input_dims[i-1]; - m_inputStrides[i] = previousDimProduct * m_strides[i]; - m_offsets[i] = startIndicesClamped[i] * previousDimProduct; - } - - // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); - } - } else { - m_inputStrides[NumDims-1] = m_strides[NumDims-1]; - m_offsets[NumDims-1] = startIndicesClamped[NumDims-1]; - Index previousDimProduct = 1; - for (int i = NumDims - 2; i >= 0; --i) { - previousDimProduct *= input_dims[i+1]; - m_inputStrides[i] = previousDimProduct * m_strides[i]; - m_offsets[i] = startIndicesClamped[i] * previousDimProduct; - } - - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 
1 : m_outputStrides[i]); - } - } - m_block_total_size_max = numext::maxi(static_cast(1), - device.lastLevelCacheSize() / - sizeof(Scalar)); - } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNonConst; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Strides Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return m_impl.coeff(srcCoeff(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { - return NULL; - } - - protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const - { - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i >= 0; --i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += idx * m_inputStrides[i] + m_offsets[i]; - index -= idx * m_outputStrides[i]; - } - } else { - for (int i = 0; i < NumDims; ++i) { - const Index idx = index / m_fastOutputStrides[i]; - inputIndex += idx * m_inputStrides[i] + m_offsets[i]; - index -= idx * m_outputStrides[i]; - } - } - return inputIndex; - } - - static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { - return numext::maxi(min, numext::mini(max,value)); - } - - array m_outputStrides; - array, NumDims> m_fastOutputStrides; - array m_inputStrides; - TensorEvaluator m_impl; - const Device& m_device; - DSizes m_startIndices; // clamped startIndices - DSizes m_dimensions; - DSizes m_offsets; // offset in a flattened shape - const Strides m_strides; - std::size_t m_block_total_size_max; -}; - -// Eval as lvalue -template -struct TensorEvaluator, Device> - : public TensorEvaluator, Device> -{ - typedef TensorEvaluator, Device> Base; - typedef TensorStridingSlicingOp XprType; - static const int NumDims = internal::array_size::value; - - enum { - IsAligned = false, - PacketAccess = false, - BlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = TensorEvaluator::CoordAccess, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNonConst; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Strides Dimensions; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) - { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } -}; - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h deleted file mode 100644 index 647bcf1..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ /dev/null @@ 
-1,397 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H -#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H - -namespace Eigen { - -/** \class TensorPadding - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor padding class. - * At the moment only padding with a constant value is supported. - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorPaddingOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorPaddingOp type; -}; - -} // end namespace internal - - - -template -class TensorPaddingOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value) - : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} - - EIGEN_DEVICE_FUNC - const PaddingDimensions& padding() const { return m_padding_dims; } - EIGEN_DEVICE_FUNC - Scalar padding_value() const { return m_padding_value; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const PaddingDimensions m_padding_dims; - const Scalar m_padding_value; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorPaddingOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::value; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = true, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) - { - // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead - // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector - // of 1 element first and then pad. 
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - - // Compute dimensions - m_dimensions = m_impl.dimensions(); - for (int i = 0; i < NumDims; ++i) { - m_dimensions[i] += m_padding[i].first + m_padding[i].second; - } - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - if (static_cast(Layout) == static_cast(ColMajor)) { - m_inputStrides[0] = 1; - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } - m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; - } else { - m_inputStrides[NumDims - 1] = 1; - m_outputStrides[NumDims] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; - } - m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - eigen_assert(index < dimensions().TotalSize()); - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_outputStrides[i]; - if (isPaddingAtIndexForDim(idx, i)) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (isPaddingAtIndexForDim(index, 0)) { - return m_paddingValue; - } - inputIndex += (index - m_padding[0].first); - } else { - for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_outputStrides[i+1]; - if (isPaddingAtIndexForDim(idx, i)) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i+1]; - } - if (isPaddingAtIndexForDim(index, NumDims-1)) { - return m_paddingValue; - } - inputIndex += (index - m_padding[NumDims-1].first); - } - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - if (static_cast(Layout) == static_cast(ColMajor)) { - return packetColMajor(index); - } - return packetRowMajor(index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - TensorOpCost cost = m_impl.costPerCoeff(vectorized); - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims; ++i) - updateCostPerDimension(cost, i, i == 0); - } else { - for (int i = NumDims - 1; i >= 0; --i) - updateCostPerDimension(cost, i, i == NumDims - 1); - } - return cost; - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - private: - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( - Index index, int dim_index) const { -#if defined(EIGEN_HAS_INDEX_LIST) - return (!internal::index_pair_first_statically_eq(dim_index, 0) && - index < m_padding[dim_index].first) || - (!internal::index_pair_second_statically_eq(dim_index, 0) && - index >= m_dimensions[dim_index] - m_padding[dim_index].second); -#else - return (index < m_padding[dim_index].first) || - (index >= m_dimensions[dim_index] - 
m_padding[dim_index].second); -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero( - int dim_index) const { -#if defined(EIGEN_HAS_INDEX_LIST) - return internal::index_pair_first_statically_eq(dim_index, 0); -#else - EIGEN_UNUSED_VARIABLE(dim_index); - return false; -#endif - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero( - int dim_index) const { -#if defined(EIGEN_HAS_INDEX_LIST) - return internal::index_pair_second_statically_eq(dim_index, 0); -#else - EIGEN_UNUSED_VARIABLE(dim_index); - return false; -#endif - } - - - void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { - const double in = static_cast(m_impl.dimensions()[i]); - const double out = in + m_padding[i].first + m_padding[i].second; - if (out == 0) - return; - const double reduction = in / out; - cost *= reduction; - if (first) { - cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + - reduction * (1 * TensorOpCost::AddCost())); - } else { - cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + - 2 * TensorOpCost::MulCost() + - reduction * (2 * TensorOpCost::MulCost() + - 1 * TensorOpCost::DivCost())); - } - } - - protected: - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index initialIndex = index; - Index inputIndex = 0; - for (int i = NumDims - 1; i > 0; --i) { - const Index first = index; - const Index last = index + PacketSize - 1; - const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; - const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; - const Index lastPaddedRight = m_outputStrides[i+1]; - - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. - const Index idx = index / m_outputStrides[i]; - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i]; - } - else { - // Every other case - return packetWithPossibleZero(initialIndex); - } - } - - const Index last = index + PacketSize - 1; - const Index first = index; - const Index lastPaddedLeft = m_padding[0].first; - const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); - const Index lastPaddedRight = m_outputStrides[1]; - - if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. 
- inputIndex += (index - m_padding[0].first); - return m_impl.template packet(inputIndex); - } - // Every other case - return packetWithPossibleZero(initialIndex); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - const Index initialIndex = index; - Index inputIndex = 0; - - for (int i = 0; i < NumDims - 1; ++i) { - const Index first = index; - const Index last = index + PacketSize - 1; - const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; - const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; - const Index lastPaddedRight = m_outputStrides[i]; - - if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. - const Index idx = index / m_outputStrides[i+1]; - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - index -= idx * m_outputStrides[i+1]; - } - else { - // Every other case - return packetWithPossibleZero(initialIndex); - } - } - - const Index last = index + PacketSize - 1; - const Index first = index; - const Index lastPaddedLeft = m_padding[NumDims-1].first; - const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); - const Index lastPaddedRight = m_outputStrides[NumDims-1]; - - if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) { - // all the coefficient are in the padding zone. - return internal::pset1(m_paddingValue); - } - else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { - // all the coefficient are between the 2 padding zones. 
- inputIndex += (index - m_padding[NumDims-1].first); - return m_impl.template packet(inputIndex); - } - // Every other case - return packetWithPossibleZero(initialIndex); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const - { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - Dimensions m_dimensions; - array m_outputStrides; - array m_inputStrides; - TensorEvaluator m_impl; - PaddingDimensions m_padding; - - Scalar m_paddingValue; -}; - - - - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h deleted file mode 100644 index 886a254..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ /dev/null @@ -1,269 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H -#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H - -namespace Eigen { - -/** \class TensorPatch - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor patch class. - * - * - */ -namespace internal { -template -struct traits > : public traits -{ - typedef typename XprType::Scalar Scalar; - typedef traits XprTraits; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - typedef typename remove_reference::type _Nested; - static const int NumDimensions = XprTraits::NumDimensions + 1; - static const int Layout = XprTraits::Layout; -}; - -template -struct eval, Eigen::Dense> -{ - typedef const TensorPatchOp& type; -}; - -template -struct nested, 1, typename eval >::type> -{ - typedef TensorPatchOp type; -}; - -} // end namespace internal - - - -template -class TensorPatchOp : public TensorBase, ReadOnlyAccessors> -{ - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) - : m_xpr(expr), m_patch_dims(patch_dims) {} - - EIGEN_DEVICE_FUNC - const PatchDim& patch_dims() const { return m_patch_dims; } - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - expression() const { return m_xpr; } - - protected: - typename XprType::Nested m_xpr; - const PatchDim m_patch_dims; -}; - - -// Eval as rvalue -template -struct TensorEvaluator, Device> -{ - typedef TensorPatchOp XprType; - typedef typename XprType::Index Index; - static const int NumDims = internal::array_size::Dimensions>::value + 1; - typedef DSizes Dimensions; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = 
internal::unpacket_traits::size; - - - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device) - { - Index num_patches = 1; - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - const PatchDim& patch_dims = op.patch_dims(); - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = 0; i < NumDims-1; ++i) { - m_dimensions[i] = patch_dims[i]; - num_patches *= (input_dims[i] - patch_dims[i] + 1); - } - m_dimensions[NumDims-1] = num_patches; - - m_inputStrides[0] = 1; - m_patchStrides[0] = 1; - for (int i = 1; i < NumDims-1; ++i) { - m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; - m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); - } - m_outputStrides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; - } - } else { - for (int i = 0; i < NumDims-1; ++i) { - m_dimensions[i+1] = patch_dims[i]; - num_patches *= (input_dims[i] - patch_dims[i] + 1); - } - m_dimensions[0] = num_patches; - - m_inputStrides[NumDims-2] = 1; - m_patchStrides[NumDims-2] = 1; - for (int i = NumDims-3; i >= 0; --i) { - m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; - m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1); - } - m_outputStrides[NumDims-1] = 1; - for (int i = NumDims-2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { - m_impl.evalSubExprsIfNeeded(NULL); - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; - // Find the location of the first element of the patch. - Index patchIndex = index / m_outputStrides[output_stride_index]; - // Find the offset of the element wrt the location of the first element. 
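// Hedged sketch of what the constructor above computes, written against the
// TensorBase::extract_patches() entry point assumed from this tree. In the
// col-major branch the first NumDims-1 result dimensions are the patch sizes,
// the last dimension counts the patches, and each axis contributes
// (input_dim - patch_dim + 1) patch positions.
#include <unsupported/Eigen/CXX11/Tensor>
#include <cstddef>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> input(4, 5);
  input.setRandom();

  Eigen::array<ptrdiff_t, 2> patch_dims{{2, 2}};
  Eigen::Tensor<float, 3> patches = input.extract_patches(patch_dims);

  // num_patches = (4 - 2 + 1) * (5 - 2 + 1) = 3 * 4 = 12
  std::cout << patches.dimension(0) << " x " << patches.dimension(1)
            << " x " << patches.dimension(2) << "\n";   // 2 x 2 x 12
  return 0;
}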
- Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; - Index inputIndex = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 2; i > 0; --i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = patchOffset / m_outputStrides[i]; - patchOffset -= offsetIdx * m_outputStrides[i]; - inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; - } - } else { - for (int i = 0; i < NumDims - 2; ++i) { - const Index patchIdx = patchIndex / m_patchStrides[i]; - patchIndex -= patchIdx * m_patchStrides[i]; - const Index offsetIdx = patchOffset / m_outputStrides[i+1]; - patchOffset -= offsetIdx * m_outputStrides[i+1]; - inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; - } - } - inputIndex += (patchIndex + patchOffset); - return m_impl.coeff(inputIndex); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; - Index indices[2] = {index, index + PacketSize - 1}; - Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], - indices[1] / m_outputStrides[output_stride_index]}; - Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], - indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]}; - - Index inputIndices[2] = {0, 0}; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumDims - 2; i > 0; --i) { - const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], - patchIndices[1] / m_patchStrides[i]}; - patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; - patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; - - const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], - patchOffsets[1] / m_outputStrides[i]}; - patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; - patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; - - inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; - inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; - } - } else { - for (int i = 0; i < NumDims - 2; ++i) { - const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], - patchIndices[1] / m_patchStrides[i]}; - patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; - patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; - - const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1], - patchOffsets[1] / m_outputStrides[i+1]}; - patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1]; - patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1]; - - inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; - inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; - } - } - inputIndices[0] += (patchIndices[0] + patchOffsets[0]); - inputIndices[1] += (patchIndices[1] + patchOffsets[1]); - - if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { - PacketReturnType rslt = m_impl.template packet(inputIndices[0]); - return rslt; - } - else { - EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; - values[0] = m_impl.coeff(inputIndices[0]); - values[PacketSize-1] = m_impl.coeff(inputIndices[1]); - for (int i = 1; i < PacketSize-1; ++i) { - values[i] = coeff(index+i); - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - } - - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - const double compute_cost = NumDims * (TensorOpCost::DivCost() + - TensorOpCost::MulCost() + - 2 * TensorOpCost::AddCost()); - return m_impl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - - EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - - protected: - Dimensions m_dimensions; - array m_outputStrides; - array m_inputStrides; - array m_patchStrides; - - TensorEvaluator m_impl; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h deleted file mode 100644 index 1655a81..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ /dev/null @@ -1,276 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H -#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H - -namespace Eigen { -namespace internal { - -namespace { - -EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#ifdef __CUDA_ARCH__ - // We don't support 3d kernels since we currently only use 1 and - // 2d kernels. - assert(threadIdx.z == 0); - return clock64() + - blockIdx.x * blockDim.x + threadIdx.x + - gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); - -#elif defined _WIN32 - // Use the current time as a baseline. - SYSTEMTIME st; - GetSystemTime(&st); - int time = st.wSecond + 1000 * st.wMilliseconds; - // Mix in a random number to make sure that we get different seeds if - // we try to generate seeds faster than the clock resolution. - // We need 2 random values since the generator only generate 16 bits at - // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; - return rnd; - -#elif defined __APPLE__ - // Same approach as for win32, except that the random number generator - // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). - uint64_t rnd = ::random() ^ mach_absolute_time(); - return rnd; - -#else - // Augment the current time with pseudo random number generation - // to ensure that we get different seeds if we try to generate seeds - // faster than the clock resolution. - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - uint64_t rnd = ::random() ^ ts.tv_nsec; - return rnd; -#endif -} - -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { - // TODO: Unify with the implementation in the non blocking thread pool. - uint64_t current = *state; - // Update the internal state - *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; - // Generate the random output (using the PCG-XSH-RS scheme) - return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); -} - -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { - seed = seed ? 
seed : get_random_seed(); - return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; -} - -} // namespace - - -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeUniform(uint64_t* state) { - unsigned rnd = PCG_XSH_RS_generator(state); - return static_cast(rnd); -} - - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::half RandomToTypeUniform(uint64_t* state) { - Eigen::half result; - // Generate 10 random bits for the mantissa - unsigned rnd = PCG_XSH_RS_generator(state); - result.x = static_cast(rnd & 0x3ffu); - // Set the exponent - result.x |= (static_cast(15) << 10); - // Return the final result - return result - Eigen::half(1.0f); -} - - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float RandomToTypeUniform(uint64_t* state) { - typedef union { - uint32_t raw; - float fp; - } internal; - internal result; - // Generate 23 random bits for the mantissa mantissa - const unsigned rnd = PCG_XSH_RS_generator(state); - result.raw = rnd & 0x7fffffu; - // Set the exponent - result.raw |= (static_cast(127) << 23); - // Return the final result - return result.fp - 1.0f; -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double RandomToTypeUniform(uint64_t* state) { - typedef union { - uint64_t raw; - double dp; - } internal; - internal result; - result.raw = 0; - // Generate 52 random bits for the mantissa - // First generate the upper 20 bits - unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; - // The generate the lower 32 bits - unsigned rnd2 = PCG_XSH_RS_generator(state); - result.raw = (static_cast(rnd1) << 32) | rnd2; - // Set the exponent - result.raw |= (static_cast(1023) << 52); - // Return the final result - return result.dp - 1.0; -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); -} -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state) { - return std::complex(RandomToTypeUniform(state), - RandomToTypeUniform(state)); -} - -template class UniformRandomGenerator { - public: - static const bool PacketAccess = true; - - // Uses the given "seed" if non-zero, otherwise uses a random seed. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( - uint64_t seed = 0) { - m_state = PCG_XSH_RS_state(seed); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( - const UniformRandomGenerator& other) { - m_state = other.m_state; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeUniform(&local_state); - m_state = local_state; - return result; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(Index i) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; - for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeUniform(&local_state); - } - m_state = local_state; - return internal::pload(values); - } - - private: - mutable uint64_t m_state; -}; - -template -struct functor_traits > { - enum { - // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)). 
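// Standalone sketch, with illustrative names, of the two tricks used above:
// the PCG-XSH-RS step (64-bit LCG state, 32-bit output) and the bit trick that
// turns 23 random mantissa bits into a uniform float in [0, 1) by forcing the
// exponent to 127 (so the value lies in [1, 2)) and subtracting 1. memcpy
// replaces the union type-pun of the original for strict-aliasing safety. The
// deleted generators also offset the state by the coefficient index
// (m_state + i) so packet and multi-threaded evaluation stay deterministic.
#include <cstdint>
#include <cstdio>
#include <cstring>

inline uint32_t pcg_xsh_rs(uint64_t* state) {
  const uint64_t current = *state;
  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;  // LCG step
  return static_cast<uint32_t>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}

inline float uniform_float(uint64_t* state) {
  const uint32_t bits = (pcg_xsh_rs(state) & 0x7fffffu) | (127u << 23);
  float f;
  std::memcpy(&f, &bits, sizeof f);
  return f - 1.0f;
}

int main() {
  uint64_t state = 0x853c49e6748fea9bULL;   // any non-zero seed
  for (int i = 0; i < 4; ++i) std::printf("%f\n", uniform_float(&state));
  return 0;
}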
- Cost = 12 * NumTraits::AddCost * - ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)), - PacketAccess = UniformRandomGenerator::PacketAccess - }; -}; - - - -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeNormal(uint64_t* state) { - // Use the ratio of uniform method to generate numbers following a normal - // distribution. See for example Numerical Recipes chapter 7.3.9 for the - // details. - T u, v, q; - do { - u = RandomToTypeUniform(state); - v = T(1.7156) * (RandomToTypeUniform(state) - T(0.5)); - const T x = u - T(0.449871); - const T y = numext::abs(v) + T(0.386595); - q = x*x + y * (T(0.196)*y - T(0.25472)*x); - } while (q > T(0.27597) && - (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u)); - - return v/u; -} - -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); -} -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state) { - return std::complex(RandomToTypeNormal(state), - RandomToTypeNormal(state)); -} - - -template class NormalRandomGenerator { - public: - static const bool PacketAccess = true; - - // Uses the given "seed" if non-zero, otherwise uses a random seed. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { - m_state = PCG_XSH_RS_state(seed); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( - const NormalRandomGenerator& other) { - m_state = other.m_state; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - T operator()(Index i) const { - uint64_t local_state = m_state + i; - T result = RandomToTypeNormal(&local_state); - m_state = local_state; - return result; - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(Index i) const { - const int packetSize = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX T values[packetSize]; - uint64_t local_state = m_state + i; - for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeNormal(&local_state); - } - m_state = local_state; - return internal::pload(values); - } - - private: - mutable uint64_t m_state; -}; - - -template -struct functor_traits > { - enum { - // On average, we need to generate about 3 random numbers - // 15 mul, 8 add, 1.5 logs - Cost = 3 * functor_traits >::Cost + - 15 * NumTraits::AddCost + 8 * NumTraits::AddCost + - 3 * functor_traits >::Cost / 2, - PacketAccess = NormalRandomGenerator::PacketAccess - }; -}; - - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h deleted file mode 100644 index 41d0d00..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ /dev/null @@ -1,781 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H - -namespace Eigen { - -/** \class TensorReduction - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reduction class. 
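// Hedged usage sketch for the two generator functors removed above. It assumes
// the templated TensorBase::setRandom<Generator>() entry point from this tree;
// a zero (default) seed makes the generators pick a time-based seed through
// get_random_seed().
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> u(64, 64), n(64, 64);
  u.setRandom<Eigen::internal::UniformRandomGenerator<float> >();  // uniform in [0, 1)
  n.setRandom<Eigen::internal::NormalRandomGenerator<float> >();   // ratio-of-uniforms normals
  return 0;
}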
- * - */ - -namespace internal { - template class MakePointer_ > - struct traits > - : traits -{ - typedef traits XprTraits; - typedef typename XprTraits::Scalar Scalar; - typedef typename XprTraits::StorageKind StorageKind; - typedef typename XprTraits::Index Index; - typedef typename XprType::Nested Nested; - static const int NumDimensions = XprTraits::NumDimensions - array_size::value; - static const int Layout = XprTraits::Layout; - - template struct MakePointer { - // Intermediate typedef to workaround MSVC issue. - typedef MakePointer_ MakePointerT; - typedef typename MakePointerT::Type Type; - }; -}; - -template class MakePointer_> -struct eval, Eigen::Dense> -{ - typedef const TensorReductionOp& type; -}; - -template class MakePointer_> -struct nested, 1, typename eval >::type> -{ - typedef TensorReductionOp type; -}; - - -template struct DimInitializer { - template EIGEN_DEVICE_FUNC - static void run(const InputDims& input_dims, - const array::value>& reduced, - OutputDims* output_dims, ReducedDims* reduced_dims) { - const int NumInputDims = internal::array_size::value; - int outputIndex = 0; - int reduceIndex = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (reduced[i]) { - (*reduced_dims)[reduceIndex] = input_dims[i]; - ++reduceIndex; - } else { - (*output_dims)[outputIndex] = input_dims[i]; - ++outputIndex; - } - } - } -}; - -template <> struct DimInitializer > { - template EIGEN_DEVICE_FUNC - static void run(const InputDims& input_dims, const array&, - Sizes<>*, array* reduced_dims) { - const int NumInputDims = internal::array_size::value; - for (int i = 0; i < NumInputDims; ++i) { - (*reduced_dims)[i] = input_dims[i]; - } - } -}; - - -template -struct are_inner_most_dims { - static const bool value = false; -}; -template -struct preserve_inner_most_dims { - static const bool value = false; -}; - -#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES -template -struct are_inner_most_dims{ - static const bool tmp1 = indices_statically_known_to_increase(); - static const bool tmp2 = index_statically_eq(0, 0); - static const bool tmp3 = index_statically_eq(array_size::value-1, array_size::value-1); - static const bool value = tmp1 & tmp2 & tmp3; -}; -template -struct are_inner_most_dims{ - static const bool tmp1 = indices_statically_known_to_increase(); - static const bool tmp2 = index_statically_eq(0, NumTensorDims - array_size::value); - static const bool tmp3 = index_statically_eq(array_size::value - 1, NumTensorDims - 1); - static const bool value = tmp1 & tmp2 & tmp3; - -}; -template -struct preserve_inner_most_dims{ - static const bool tmp1 = indices_statically_known_to_increase(); - static const bool tmp2 = index_statically_gt(0, 0); - static const bool value = tmp1 & tmp2; - -}; -template -struct preserve_inner_most_dims{ - static const bool tmp1 = indices_statically_known_to_increase(); - static const bool tmp2 = index_statically_lt(array_size::value - 1, NumTensorDims - 1); - static const bool value = tmp1 & tmp2; -}; -#endif - - -template -struct GenericDimReducer { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { - EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; - GenericDimReducer::reduce(self, input, reducer, accum); - } - } -}; -template -struct GenericDimReducer<0, Self, Op> { - 
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { - for (int j = 0; j < self.m_reducedDims[0]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; - reducer.reduce(self.m_impl.coeff(input), accum); - } - } -}; -template -struct GenericDimReducer<-1, Self, Op> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) { - reducer.reduce(self.m_impl.coeff(index), accum); - } -}; - -template -struct InnerMostDimReducer { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { - typename Self::CoeffReturnType accum = reducer.initialize(); - for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { - reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); - } - return reducer.finalize(accum); - } -}; - -template -struct InnerMostDimReducer { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { - const int packetSize = internal::unpacket_traits::size; - const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; - typename Self::PacketReturnType p = reducer.template initializePacket(); - for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { - reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &p); - } - typename Self::CoeffReturnType accum = reducer.initialize(); - for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { - reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); - } - return reducer.finalizeBoth(accum, p); - } -}; - -template -struct InnerMostDimPreserver { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { - eigen_assert(false && "should never be called"); - } -}; - -template -struct InnerMostDimPreserver { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { - EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; - InnerMostDimPreserver::reduce(self, input, reducer, accum); - } - } -}; - -template -struct InnerMostDimPreserver<0, Self, Op, true> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { - for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) { - const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; - reducer.reducePacket(self.m_impl.template packet(input), accum); - } - } -}; -template -struct InnerMostDimPreserver<-1, Self, Op, true> { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { - eigen_assert(false && "should never be called"); - } -}; - -// Default full reducer -template -struct FullReducer { - static const bool HasOptimizedImplementation = false; - - static 
EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { - const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); - *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); - } -}; - - -#ifdef EIGEN_USE_THREADS -// Multithreaded full reducers -template -struct FullReducerShard { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, - typename Self::Index numValuesToReduce, Op& reducer, - typename Self::CoeffReturnType* output) { - *output = InnerMostDimReducer::reduce( - self, firstIndex, numValuesToReduce, reducer); - } -}; - -// Multithreaded full reducer -template -struct FullReducer { - static const bool HasOptimizedImplementation = !Op::IsStateful; - static const int PacketSize = - unpacket_traits::size; - - // launch one reducer per thread and accumulate the result. - static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, - typename Self::CoeffReturnType* output) { - typedef typename Self::Index Index; - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - if (num_coeffs == 0) { - *output = reducer.finalize(reducer.initialize()); - return; - } - const TensorOpCost cost = - self.m_impl.costPerCoeff(Vectorizable) + - TensorOpCost(0, 0, internal::functor_traits::Cost, Vectorizable, - PacketSize); - const int num_threads = TensorCostModel::numThreads( - num_coeffs, cost, device.numThreads()); - if (num_threads == 1) { - *output = - InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); - return; - } - const Index blocksize = - std::floor(static_cast(num_coeffs) / num_threads); - const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; - eigen_assert(num_coeffs >= numblocks * blocksize); - - Barrier barrier(internal::convert_index(numblocks)); - MaxSizeVector shards(numblocks, reducer.initialize()); - for (Index i = 0; i < numblocks; ++i) { - device.enqueue_with_barrier(&barrier, &FullReducerShard::run, - self, i * blocksize, blocksize, reducer, - &shards[i]); - } - typename Self::CoeffReturnType finalShard; - if (numblocks * blocksize < num_coeffs) { - finalShard = InnerMostDimReducer::reduce( - self, numblocks * blocksize, num_coeffs - numblocks * blocksize, - reducer); - } else { - finalShard = reducer.initialize(); - } - barrier.Wait(); - - for (Index i = 0; i < numblocks; ++i) { - reducer.reduce(shards[i], &finalShard); - } - *output = reducer.finalize(finalShard); - } -}; - -#endif - - -// Default inner reducer -template -struct InnerReducer { - static const bool HasOptimizedImplementation = false; - - EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - eigen_assert(false && "Not implemented"); - return true; - } -}; - -// Default outer reducer -template -struct OuterReducer { - static const bool HasOptimizedImplementation = false; - - EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - eigen_assert(false && "Not implemented"); - return true; - } -}; - - -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -template -__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); - - -#ifdef EIGEN_HAS_CUDA_FP16 -template -__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); -template -__global__ void FullReductionKernelHalfFloat(R, const S, I, 
half*, half2*); -template -__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*); - -#endif - -template -__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); - -template -__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); -#endif - -} // end namespace internal - - -template class MakePointer_> -class TensorReductionOp : public TensorBase, ReadOnlyAccessors> { - public: - typedef typename Eigen::internal::traits::Scalar Scalar; - typedef typename Eigen::NumTraits::Real RealScalar; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename Eigen::internal::nested::type Nested; - typedef typename Eigen::internal::traits::StorageKind StorageKind; - typedef typename Eigen::internal::traits::Index Index; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) - { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) - { } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const XprType& expression() const { return m_expr; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Dims& dims() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Op& reducer() const { return m_reducer; } - - protected: - typename XprType::Nested m_expr; - const Dims m_dims; - const Op m_reducer; -}; - - -// Eval as rvalue -template class MakePointer_, typename Device> -struct TensorEvaluator, Device> -{ - typedef TensorReductionOp XprType; - typedef typename XprType::Index Index; - typedef ArgType ChildType; - typedef typename TensorEvaluator::Dimensions InputDimensions; - static const int NumInputDims = internal::array_size::value; - static const int NumReducedDims = internal::array_size::value; - static const int NumOutputDims = NumInputDims - NumReducedDims; - typedef typename internal::conditional, DSizes >::type Dimensions; - typedef typename XprType::Scalar Scalar; - typedef TensorEvaluator, Device> Self; - static const bool InputPacketAccess = TensorEvaluator::PacketAccess; - typedef typename internal::remove_const::type CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits::size; - - enum { - IsAligned = false, - PacketAccess = Self::InputPacketAccess && Op::PacketAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; - static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; - static const bool RunningFullReduction = (NumOutputDims==0); - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims()) - { - EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), - YOU_MADE_A_PROGRAMMING_MISTAKE); - - // Build the bitmap indicating if an input dimension is reduced or not. 
- for (int i = 0; i < NumInputDims; ++i) { - m_reduced[i] = false; - } - for (int i = 0; i < NumReducedDims; ++i) { - eigen_assert(op.dims()[i] >= 0); - eigen_assert(op.dims()[i] < NumInputDims); - m_reduced[op.dims()[i]] = true; - } - - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); - internal::DimInitializer::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); - - // Precompute output strides. - if (NumOutputDims > 0) { - if (static_cast(Layout) == static_cast(ColMajor)) { - m_outputStrides[0] = 1; - for (int i = 1; i < NumOutputDims; ++i) { - m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - } - } else { - m_outputStrides.back() = 1; - for (int i = NumOutputDims - 2; i >= 0; --i) { - m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - } - } - } - - // Precompute input strides. - if (NumInputDims > 0) { - array input_strides; - if (static_cast(Layout) == static_cast(ColMajor)) { - input_strides[0] = 1; - for (int i = 1; i < NumInputDims; ++i) { - input_strides[i] = input_strides[i-1] * input_dims[i-1]; - } - } else { - input_strides.back() = 1; - for (int i = NumInputDims - 2; i >= 0; --i) { - input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; - } - } - - int outputIndex = 0; - int reduceIndex = 0; - for (int i = 0; i < NumInputDims; ++i) { - if (m_reduced[i]) { - m_reducedStrides[reduceIndex] = input_strides[i]; - ++reduceIndex; - } else { - m_preservedStrides[outputIndex] = input_strides[i]; - ++outputIndex; - } - } - } - - // Special case for full reductions - if (NumOutputDims == 0) { - m_preservedStrides[0] = internal::array_prod(input_dims); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_::Type data) { - m_impl.evalSubExprsIfNeeded(NULL); - - // Use the FullReducer if possible. - if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction && - internal::FullReducer::HasOptimizedImplementation && - ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || - !RunningOnGPU))) { - bool need_assign = false; - if (!data) { - m_result = static_cast(m_device.allocate(sizeof(CoeffReturnType))); - data = m_result; - need_assign = true; - } - Op reducer(m_reducer); - internal::FullReducer::run(*this, reducer, m_device, data); - return need_assign; - } - else if(RunningOnSycl){ - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); - m_result = data; - } - Op reducer(m_reducer); - internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return (m_result != NULL); - } - - // Attempt to use an optimized reduction. 
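// Minimal sketch of the reducer interface that the FullReducer path above (and
// GenericDimReducer / InnerMostDimReducer earlier) relies on: initialize(),
// reduce(), finalize(), plus packet variants when PacketAccess is true. This
// illustrative sum reducer is not Eigen's SumReducer; the loop in main() is
// what the default single-threaded, scalar full reduction boils down to.
#include <cstdio>

struct SumReducerSketch {
  static const bool PacketAccess = false;  // scalar-only for brevity
  static const bool IsStateful   = false;  // stateless reducers allow the threaded shard path

  float initialize() const { return 0.0f; }
  void  reduce(float t, float* accum) const { *accum += t; }
  float finalize(float accum) const { return accum; }
};

int main() {
  const float data[5] = {1, 2, 3, 4, 5};
  SumReducerSketch reducer;
  float accum = reducer.initialize();
  for (int i = 0; i < 5; ++i) reducer.reduce(data[i], &accum);
  std::printf("%f\n", reducer.finalize(accum));   // 15.000000
  return 0;
}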
- else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { - bool reducing_inner_dims = true; - for (int i = 0; i < NumReducedDims; ++i) { - if (static_cast(Layout) == static_cast(ColMajor)) { - reducing_inner_dims &= m_reduced[i]; - } else { - reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; - } - } - if (internal::InnerReducer::HasOptimizedImplementation && - (reducing_inner_dims || ReducingInnerMostDims)) { - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); - m_result = data; - } - else { - return true; - } - } - Op reducer(m_reducer); - if (internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { - if (m_result) { - m_device.deallocate(m_result); - m_result = NULL; - } - return true; - } else { - return (m_result != NULL); - } - } - - bool preserving_inner_dims = true; - for (int i = 0; i < NumReducedDims; ++i) { - if (static_cast(Layout) == static_cast(ColMajor)) { - preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; - } else { - preserving_inner_dims &= m_reduced[i]; - } - } - if (internal::OuterReducer::HasOptimizedImplementation && - preserving_inner_dims) { - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); - if (!data) { - if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { - data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); - m_result = data; - } - else { - return true; - } - } - Op reducer(m_reducer); - if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { - if (m_result) { - m_device.deallocate(m_result); - m_result = NULL; - } - return true; - } else { - return (m_result != NULL); - } - } - } - return true; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_impl.cleanup(); - if (m_result) { - m_device.deallocate(m_result); - m_result = NULL; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) { - return *(m_result + index); - } - Op reducer(m_reducer); - if (ReducingInnerMostDims || RunningFullReduction) { - const Index num_values_to_reduce = - (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; - return internal::InnerMostDimReducer::reduce(*this, firstInput(index), - num_values_to_reduce, reducer); - } else { - typename Self::CoeffReturnType accum = reducer.initialize(); - internal::GenericDimReducer::reduce(*this, firstInput(index), reducer, &accum); - return reducer.finalize(accum); - } - } - - // TODO(bsteiner): provide a more efficient implementation. 
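// Hedged usage sketch of the reduction expression this evaluator serves,
// assuming the TensorBase reduction API (sum() over selected dimensions and
// over all dimensions) from this tree. Reducing over k dimensions drops the
// rank by k; a full reduction yields a rank-0 tensor.
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 3> t(4, 5, 6);
  t.setConstant(1.0f);

  Eigen::array<int, 1> dims{{1}};            // reduce over the middle dimension
  Eigen::Tensor<float, 2> s = t.sum(dims);   // 4 x 6 result, every entry equals 5
  Eigen::Tensor<float, 0> total = t.sum();   // full reduction over all 120 entries
  std::cout << s(0, 0) << " " << total() << "\n";   // 5 120
  return 0;
}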
- template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); - - if (RunningOnGPU && m_result) { - return internal::pload(m_result + index); - } - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - if (ReducingInnerMostDims) { - const Index num_values_to_reduce = - (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; - const Index firstIndex = firstInput(index); - for (Index i = 0; i < PacketSize; ++i) { - Op reducer(m_reducer); - values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, - num_values_to_reduce, reducer); - } - } else if (PreservingInnerMostDims) { - const Index firstIndex = firstInput(index); - const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; - // TBD: extend this the the n innermost dimensions that we preserve. - if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { - Op reducer(m_reducer); - typename Self::PacketReturnType accum = reducer.template initializePacket(); - internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); - return reducer.finalizePacket(accum); - } else { - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index + i); - } - } - } else { - for (int i = 0; i < PacketSize; ++i) { - values[i] = coeff(index + i); - } - } - PacketReturnType rslt = internal::pload(values); - return rslt; - } - - // Must be called after evalSubExprsIfNeeded(). - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - if (RunningFullReduction && m_result) { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); - } else { - const Index num_values_to_reduce = internal::array_prod(m_reducedDims); - const double compute_cost = num_values_to_reduce * internal::functor_traits::Cost; - return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + - TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); - } - } - - EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } - /// required by sycl in order to extract the accessor - const TensorEvaluator& impl() const { return m_impl; } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} - /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel - const Dims& xprDims() const {return m_xpr_dims;} - - - private: - template friend struct internal::GenericDimReducer; - template friend struct internal::InnerMostDimReducer; - template friend struct internal::InnerMostDimPreserver; - template friend struct internal::FullReducer; -#ifdef EIGEN_USE_THREADS - template friend struct internal::FullReducerShard; -#endif -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); -#ifdef EIGEN_HAS_CUDA_FP16 - template friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); - template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); - template friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); -#endif - template friend void 
internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); - - template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); -#endif - - template friend struct internal::InnerReducer; - - // Returns the Index in the input tensor of the first value that needs to be - // used to compute the reduction at output index "index". - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { - if (ReducingInnerMostDims) { - if (static_cast(Layout) == static_cast(ColMajor)) { - return index * m_preservedStrides[0]; - } else { - return index * m_preservedStrides[NumPreservedStrides - 1]; - } - } - // TBD: optimize the case where we preserve the innermost dimensions. - Index startInput = 0; - if (static_cast(Layout) == static_cast(ColMajor)) { - for (int i = NumOutputDims - 1; i > 0; --i) { - // This is index_i in the output tensor. - const Index idx = index / m_outputStrides[i]; - startInput += idx * m_preservedStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (PreservingInnerMostDims) { - eigen_assert(m_preservedStrides[0] == 1); - startInput += index; - } else { - startInput += index * m_preservedStrides[0]; - } - } else { - for (int i = 0; i < NumOutputDims - 1; ++i) { - // This is index_i in the output tensor. - const Index idx = index / m_outputStrides[i]; - startInput += idx * m_preservedStrides[i]; - index -= idx * m_outputStrides[i]; - } - if (PreservingInnerMostDims) { - eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1); - startInput += index; - } else { - startInput += index * m_preservedStrides[NumPreservedStrides - 1]; - } - } - return startInput; - } - - // Bitmap indicating if an input dimension is reduced or not. - array m_reduced; - // Dimensions of the output of the operation. - Dimensions m_dimensions; - // Precomputed strides for the output tensor. - array m_outputStrides; - // Subset of strides of the input tensor for the non-reduced dimensions. - // Indexed by output dimensions. - static const int NumPreservedStrides = max_n_1::size; - array m_preservedStrides; - - // Subset of strides of the input tensor for the reduced dimensions. - // Indexed by reduced dimensions. - array m_reducedStrides; - // Size of the input dimensions that are reduced. - // Indexed by reduced dimensions. - array m_reducedDims; - - // Evaluator for the input expression. - TensorEvaluator m_impl; - - // Operation to apply for computing the reduction. - Op m_reducer; - - // For full reductions -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - static const bool RunningOnGPU = internal::is_same::value; - static const bool RunningOnSycl = false; -#elif defined(EIGEN_USE_SYCL) -static const bool RunningOnSycl = internal::is_same::type, Eigen::SyclDevice>::value; -static const bool RunningOnGPU = false; -#else - static const bool RunningOnGPU = false; - static const bool RunningOnSycl = false; -#endif - typename MakePointer_::Type m_result; - - const Device& m_device; - const Dims& m_xpr_dims; -}; - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff --git a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h deleted file mode 100644 index 65638b6..0000000 --- a/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ /dev/null @@ -1,750 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
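// Standalone sketch (illustrative names, not Eigen's) of the index mapping that
// firstInput() above performs: the output index is peeled into coordinates
// along the preserved dimensions using the output strides, and each coordinate
// is re-expanded with the corresponding preserved input stride to locate the
// first input element of that reduction.
#include <array>
#include <cstdio>

int main() {
  // Col-major 4 x 5 x 6 input, reducing dimension 1; preserved dims are {0, 2}.
  const std::array<long, 2> output_strides    = {1, 4};   // strides of the 4 x 6 output
  const std::array<long, 2> preserved_strides = {1, 20};  // input strides of dims 0 and 2

  long index = 13;   // output coordinate (1, 3)
  long start = 0;
  for (int i = 1; i > 0; --i) {                 // outermost preserved dims first
    const long idx = index / output_strides[i];
    start += idx * preserved_strides[i];
    index -= idx * output_strides[i];
  }
  start += index * preserved_strides[0];        // innermost preserved dim
  std::printf("%ld\n", start);   // 61, i.e. element (1, 0, 3) of the input
  return 0;
}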
-// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H -#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H - -namespace Eigen { -namespace internal { - - -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) -// Full reducers for GPU, don't vectorize for now - -// Reducer function that enables multiple cuda thread to safely accumulate at the same -// output address. It basically reads the current value of the output variable, and -// attempts to update it with the new value. If in the meantime another cuda thread -// updated the content of the output address it will try again. -template -__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { -#if __CUDA_ARCH__ >= 300 - if (sizeof(T) == 4) - { - unsigned int oldval = *reinterpret_cast(output); - unsigned int newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned int readback; - while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else if (sizeof(T) == 8) { - unsigned long long oldval = *reinterpret_cast(output); - unsigned long long newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - unsigned long long readback; - while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { - oldval = readback; - newval = oldval; - reducer.reduce(accum, reinterpret_cast(&newval)); - if (newval == oldval) { - return; - } - } - } - else { - assert(0 && "Wordsize not supported"); - } -#else - assert(0 && "Shouldn't be called on unsupported device"); -#endif -} - -// We extend atomicExch to support extra data types -template -__device__ inline Type atomicExchCustom(Type* address, Type val) { - return atomicExch(address, val); -} - -template <> -__device__ inline double atomicExchCustom(double* address, double val) { - unsigned long long int* address_as_ull = reinterpret_cast(address); - return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); -} - -#ifdef EIGEN_HAS_CUDA_FP16 -template
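// Host-side sketch of the read-modify-CAS retry loop that atomicReduce() above
// implements with CUDA atomicCAS, written with std::atomic so it runs anywhere.
// The device version reinterprets the 4- or 8-byte payload as an integer for
// the CAS; the control flow is the same: fold the partial result into a copy of
// the current output, and retry whenever another thread published a new value first.
#include <atomic>
#include <cstdio>

template <typename T, typename Reduce>
void atomic_reduce(std::atomic<T>* output, T accum, Reduce reduce) {
  T oldval = output->load();
  T newval;
  do {
    newval = oldval;
    reduce(accum, &newval);          // e.g. newval += accum for a sum reduction
    if (newval == oldval) return;    // nothing changed (e.g. a max that is already larger)
  } while (!output->compare_exchange_weak(oldval, newval));  // oldval reloaded on failure
}

int main() {
  std::atomic<float> out{0.0f};
  atomic_reduce(&out, 3.5f, [](float a, float* b) { *b += a; });
  std::printf("%f\n", out.load());   // 3.500000
  return 0;
}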