summaryrefslogtreecommitdiffhomepage
path: root/eigen/Eigen/src/Core/arch
diff options
context:
space:
mode:
authorStanislaw Halik <sthalik@misaki.pl>2019-01-16 11:45:13 +0100
committerStanislaw Halik <sthalik@misaki.pl>2019-01-16 11:45:13 +0100
commitbbdfe42628cc324904a49d472230c8cbbfd9e1d5 (patch)
tree0ae6a380649af4a854c88245abb1c9fa3a571cc4 /eigen/Eigen/src/Core/arch
parent3e07e568a1ae478b89812d91438d75179c94ab35 (diff)
update eigen
Diffstat (limited to 'eigen/Eigen/src/Core/arch')
-rw-r--r--eigen/Eigen/src/Core/arch/AVX/PacketMath.h13
-rw-r--r--eigen/Eigen/src/Core/arch/AVX512/PacketMath.h6
-rw-r--r--eigen/Eigen/src/Core/arch/CUDA/Half.h193
-rw-r--r--eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h3
-rw-r--r--eigen/Eigen/src/Core/arch/SSE/PacketMath.h2
5 files changed, 121 insertions, 96 deletions
diff --git a/eigen/Eigen/src/Core/arch/AVX/PacketMath.h b/eigen/Eigen/src/Core/arch/AVX/PacketMath.h
index 61c3dfc..923a124 100644
--- a/eigen/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/eigen/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -159,11 +159,12 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
-#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
- // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
- // and gcc stupidly generates a vfmadd132ps instruction,
- // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate
- // the result of the product.
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
+ // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
+ // and even register spilling with clang>=6.0 (bug 1637).
+ // Gcc stupidly generates a vfmadd132ps instruction.
+ // So let's enforce it to generate a vfmadd231ps instruction since the most common use
+ // case is to accumulate the result of the product.
Packet8f res = c;
__asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
return res;
@@ -172,7 +173,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f&
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
-#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
+#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
// see above
Packet4d res = c;
__asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
diff --git a/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h b/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h
index 8970524..5adddc7 100644
--- a/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -648,13 +648,13 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
{
// _mm512_abs_ps intrinsic not found, so hack around it
- return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
+ return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
}
template <>
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
// _mm512_abs_ps intrinsic not found, so hack around it
- return (__m512d)_mm512_and_si512((__m512i)a,
- _mm512_set1_epi64(0x7fffffffffffffff));
+ return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
+ _mm512_set1_epi64(0x7fffffffffffffff)));
}
#ifdef EIGEN_VECTORIZE_AVX512DQ
diff --git a/eigen/Eigen/src/Core/arch/CUDA/Half.h b/eigen/Eigen/src/Core/arch/CUDA/Half.h
index 02ac0c2..755e620 100644
--- a/eigen/Eigen/src/Core/arch/CUDA/Half.h
+++ b/eigen/Eigen/src/Core/arch/CUDA/Half.h
@@ -29,7 +29,7 @@
// type Eigen::half (inheriting from CUDA's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
-// in fp32 for CPUs, except for simple parameter conversions, I/O
+// in float32_bits for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs.
@@ -50,38 +50,45 @@ struct half;
namespace half_impl {
#if !defined(EIGEN_HAS_CUDA_FP16)
-
-// Make our own __half definition that is similar to CUDA's.
-struct __half {
- EIGEN_DEVICE_FUNC __half() {}
- explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+ EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+ explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
unsigned short x;
};
-
+#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+typedef __half __half_raw;
#endif
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
-struct half_base : public __half {
+struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC half_base() {}
- EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
- EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+ EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
+ EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+#endif
};
} // namespace half_impl
// Class definition.
struct half : public half_impl::half_base {
- #if !defined(EIGEN_HAS_CUDA_FP16)
- typedef half_impl::__half __half;
+ #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+ typedef half_impl::__half_raw __half_raw;
#endif
EIGEN_DEVICE_FUNC half() {}
- EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+ EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#endif
explicit EIGEN_DEVICE_FUNC half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@@ -138,12 +145,66 @@ struct half : public half_impl::half_base {
}
};
+} // end namespace Eigen
+
+namespace std {
+template<>
+struct numeric_limits<Eigen::half> {
+ static const bool is_specialized = true;
+ static const bool is_signed = true;
+ static const bool is_integer = false;
+ static const bool is_exact = false;
+ static const bool has_infinity = true;
+ static const bool has_quiet_NaN = true;
+ static const bool has_signaling_NaN = true;
+ static const float_denorm_style has_denorm = denorm_present;
+ static const bool has_denorm_loss = false;
+ static const std::float_round_style round_style = std::round_to_nearest;
+ static const bool is_iec559 = false;
+ static const bool is_bounded = false;
+ static const bool is_modulo = false;
+ static const int digits = 11;
+ static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static const int radix = 2;
+ static const int min_exponent = -13;
+ static const int min_exponent10 = -4;
+ static const int max_exponent = 16;
+ static const int max_exponent10 = 4;
+ static const bool traps = true;
+ static const bool tinyness_before = false;
+
+ static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
+ static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+ static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+ static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
+ static Eigen::half round_error() { return Eigen::half(0.5); }
+ static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+ static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+ static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
+};
+
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template<>
+struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
+template<>
+struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
+} // end namespace std
+
+namespace Eigen {
+
namespace half_impl {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
// Intrinsics for native fp16 support. Note that on current hardware,
-// these are no faster than fp32 arithmetic (you need to use the half2
+// these are no faster than float32_bits arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.
@@ -202,7 +263,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
#else // Emulate support for half floats
// Definitions for CPUs and older CUDA, mostly working through conversion
-// to/from fp32.
+// to/from float32_bits.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
return half(float(a) + float(b));
@@ -269,34 +330,35 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
- __half h;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
+ __half_raw h;
h.x = x;
return h;
}
-union FP32 {
+union float32_bits {
unsigned int u;
float f;
};
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
- return __float2half(ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+ __half tmp_ff = __float2half(ff);
+ return *(__half_raw*)&tmp_ff;
#elif defined(EIGEN_HAS_FP16_C)
- __half h;
+ __half_raw h;
h.x = _cvtss_sh(ff, 0);
return h;
#else
- FP32 f; f.f = ff;
+ float32_bits f; f.f = ff;
- const FP32 f32infty = { 255 << 23 };
- const FP32 f16max = { (127 + 16) << 23 };
- const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+ const float32_bits f32infty = { 255 << 23 };
+ const float32_bits f16max = { (127 + 16) << 23 };
+ const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u;
- __half o;
+ __half_raw o;
o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
@@ -335,17 +397,17 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
return _cvtsh_ss(h.x);
#else
- const FP32 magic = { 113 << 23 };
+ const float32_bits magic = { 113 << 23 };
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
- FP32 o;
+ float32_bits o;
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
unsigned int exp = shifted_exp & o.u; // just the exponent
@@ -370,7 +432,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
return (a.x & 0x7fff) == 0x7c00;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hisnan(a);
#else
return (a.x & 0x7fff) > 0x7c00;
@@ -443,7 +505,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hlt(b, a) ? b : a;
#else
const float f1 = static_cast<float>(a);
@@ -452,7 +514,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
return __hlt(a, b) ? b : a;
#else
const float f1 = static_cast<float>(a);
@@ -490,49 +552,6 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
} // end namespace internal
-} // end namespace Eigen
-
-namespace std {
-template<>
-struct numeric_limits<Eigen::half> {
- static const bool is_specialized = true;
- static const bool is_signed = true;
- static const bool is_integer = false;
- static const bool is_exact = false;
- static const bool has_infinity = true;
- static const bool has_quiet_NaN = true;
- static const bool has_signaling_NaN = true;
- static const float_denorm_style has_denorm = denorm_present;
- static const bool has_denorm_loss = false;
- static const std::float_round_style round_style = std::round_to_nearest;
- static const bool is_iec559 = false;
- static const bool is_bounded = false;
- static const bool is_modulo = false;
- static const int digits = 11;
- static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
- static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
- static const int radix = 2;
- static const int min_exponent = -13;
- static const int min_exponent10 = -4;
- static const int max_exponent = 16;
- static const int max_exponent10 = 4;
- static const bool traps = true;
- static const bool tinyness_before = false;
-
- static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
- static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
- static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
- static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
- static Eigen::half round_error() { return Eigen::half(0.5); }
- static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
- static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
- static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
- static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
-};
-}
-
-namespace Eigen {
-
template<> struct NumTraits<Eigen::half>
: GenericNumTraits<Eigen::half>
{
@@ -607,14 +626,18 @@ struct hash<Eigen::half> {
// Add the missing shfl_xor intrinsic
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+ #if EIGEN_CUDACC_VER < 90000
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+ #else
+ return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
+ #endif
}
#endif
-// ldg() has an overload for __half, but we also need one for Eigen::half.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
@@ -622,7 +645,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
#endif
-#if defined(__CUDA_ARCH__)
+#if defined(EIGEN_CUDA_ARCH)
namespace Eigen {
namespace numext {
diff --git a/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 943e0b0..c66d384 100644
--- a/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -99,7 +99,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2&
template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
half2 result;
- result.x = a.x & 0x7FFF7FFF;
+ unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
+ *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
return result;
}
diff --git a/eigen/Eigen/src/Core/arch/SSE/PacketMath.h b/eigen/Eigen/src/Core/arch/SSE/PacketMath.h
index 5e652cc..60e2517 100644
--- a/eigen/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/eigen/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -28,7 +28,7 @@ namespace internal {
#endif
#endif
-#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)
+#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
// have overloads for both types without linking error.
// One solution is to increase ABI version using -fabi-version=4 (or greater).